You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2019/08/23 11:44:44 UTC
svn commit: r1865744 - in /manifoldcf/trunk: CHANGES.txt
connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
Author: kwright
Date: Fri Aug 23 11:44:43 2019
New Revision: 1865744
URL: http://svn.apache.org/viewvc?rev=1865744&view=rev
Log:
Fix for CONNECTORS-1621.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1865744&r1=1865743&r2=1865744&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Aug 23 11:44:43 2019
@@ -3,6 +3,9 @@ $Id$
======================= 2.14-dev =====================
+CONNECTORS-1621: Fix broken ability to index Tika-extracted documents in Solr.
+(Markus Schuch, Karl Wright)
+
CONNECTORS-1620: Add ability to extract links from application/xml documents.
(Markus Schuch)
Modified: manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java?rev=1865744&r1=1865743&r2=1865744&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java (original)
+++ manifoldcf/trunk/connectors/solr/connector/src/main/java/org/apache/manifoldcf/agents/output/solr/HttpPoster.java Fri Aug 23 11:44:43 2019
@@ -578,7 +578,9 @@ public class HttpPoster
}
// If not the right mime type, reject it.
- if ((includedMimeTypes !=null || excludedMimeTypes != null) && !checkMimeTypeIndexable(document.getMimeType(), useExtractUpdateHandler, includedMimeTypes, excludedMimeTypes)) {
+ // Note: this code added as part of CONNECTORS-1482 was incorrect! Document filtering specified in the solr connector is always against the
+ // ORIGINAL mime type (which is what's in the document). This is why the checkMimeTypeIndexable second argument is always "true".
+ if ((includedMimeTypes !=null || excludedMimeTypes != null) && !checkMimeTypeIndexable(document.getMimeType(), true, includedMimeTypes, excludedMimeTypes)) {
activities.recordActivity(null,SolrConnector.INGEST_ACTIVITY,null,documentURI,activities.EXCLUDED_MIMETYPE,"Solr connector rejected document due to mime type restrictions: ("+document.getMimeType()+")");
return false;
}
@@ -812,9 +814,17 @@ public class HttpPoster
final String lowerMimeType = (mimeType==null)?null:mimeType.toLowerCase(Locale.ROOT);
if (useExtractUpdateHandler)
{
- if (includedMimeTypes != null && !includedMimeTypes.contains(lowerMimeType))
+ // Strip the charset off for this check
+ int index = lowerMimeType == null ? -1 : lowerMimeType.indexOf(";");
+ final String checkMimeType;
+ if (index != -1) {
+ checkMimeType = lowerMimeType.substring(0,index);
+ } else {
+ checkMimeType = lowerMimeType;
+ }
+ if (includedMimeTypes != null && !includedMimeTypes.contains(checkMimeType))
return false;
- if (excludedMimeTypes != null && excludedMimeTypes.contains(lowerMimeType))
+ if (excludedMimeTypes != null && excludedMimeTypes.contains(checkMimeType))
return false;
return true;
}