You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2021/03/20 08:18:36 UTC

svn commit: r1887840 - in /manifoldcf/trunk: CHANGES.txt connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java

Author: kwright
Date: Sat Mar 20 08:18:36 2021
New Revision: 1887840

URL: http://svn.apache.org/viewvc?rev=1887840&view=rev
Log:
Fix for CONNECTORS-1666.

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1887840&r1=1887839&r2=1887840&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Sat Mar 20 08:18:36 2021
@@ -3,6 +3,11 @@ $Id$
 
 ======================= 2.19-dev =====================
 
+CONNECTORS-1666: Elastic Search connector now limits size of URI according to
+ES rules.  Standard MCF hashing is done to ensure this, but only if the documentURI
+is large enough to warrant it.
+(Shirai Takashi, Karl Wright)
+
 CONNECTORS-1656: Ensure legit XML is produced for Tika by the html extractor.
 (Julien Massiera)
 

Modified: manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java?rev=1887840&r1=1887839&r2=1887840&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java (original)
+++ manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java Sat Mar 20 08:18:36 2021
@@ -62,12 +62,11 @@ import org.apache.manifoldcf.agents.outp
 import org.apache.manifoldcf.agents.output.elasticsearch.ElasticSearchConnection.Result;
 import org.apache.manifoldcf.core.interfaces.Specification;
 import org.apache.manifoldcf.core.interfaces.ConfigParams;
-//import org.apache.manifoldcf.core.interfaces.ConfigurationNode;
 import org.apache.manifoldcf.core.interfaces.IHTTPOutput;
 import org.apache.manifoldcf.core.interfaces.IPostParameters;
 import org.apache.manifoldcf.core.interfaces.IThreadContext;
 import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
-//import org.apache.manifoldcf.core.interfaces.SpecificationNode;
+import org.apache.manifoldcf.core.system.ManifoldCF;
 import org.apache.manifoldcf.core.interfaces.VersionContext;
 import org.apache.manifoldcf.connectorcommon.interfaces.IKeystoreManager;
 import org.apache.manifoldcf.connectorcommon.common.InterruptibleSocketFactory;
@@ -103,6 +102,9 @@ public class ElasticSearchConnector exte
   /** Connection expiration interval */
   private static final long EXPIRATION_INTERVAL = 60000L;
 
+  /** Maximum document ID length; ElasticSearch rejects longer IDs with an HTTP 400 error */
+  private static final int maxIdLength = 512;
+
   private HttpClientConnectionManager connectionManager = null;
   private HttpClient client = null;
   private long expirationTime = -1L;
@@ -200,7 +202,21 @@ public class ElasticSearchConnector exte
     client = null;
     expirationTime = -1L;
   }
-  
+
+  /** Create an ID string for ElasticSearch from a document URI, hashing it only if it is too long.
+  * @param documentURI is the URI of the document.
+  * @return the URI unchanged if its length is at most maxIdLength, otherwise its hash.
+  */
+  protected static String compressDocumentURI(String documentURI)
+    throws ManifoldCFException
+  {
+    // If the ID is too long, we must do things to reduce its size.  This involves hashing, but
+    // for backwards compatibility we only do it if it is too long.
+    if (documentURI.length() <= maxIdLength)
+      return documentURI;
+    return ManifoldCF.hash(documentURI);
+  }
+
   /** This method is called to assess whether to count this connector instance should
   * actually be counted as being connected.
   *@return true if the connector instance is actually connected.
@@ -375,6 +391,7 @@ public class ElasticSearchConnector exte
       IOutputAddActivity activities) throws ManifoldCFException,
       ServiceInterruption, IOException
   {
+    String compressedDocumentURI = compressDocumentURI(documentURI);
     HttpClient client = getSession();
     ElasticSearchConfig config = getConfigParameters(null);
 
@@ -411,7 +428,7 @@ public class ElasticSearchConnector exte
       else
       {
         // Don't know how to deal with it
-        activities.recordActivity(null,ELASTICSEARCH_INDEXATION_ACTIVITY,document.getBinaryLength(),documentURI,activities.UNKNOWN_SECURITY,"Rejected document that has security info which ElasticSearch does not recognize: '"+ securityType + "'");
+        activities.recordActivity(null, ELASTICSEARCH_INDEXATION_ACTIVITY, document.getBinaryLength(), documentURI, activities.UNKNOWN_SECURITY, "Rejected document that has security info which ElasticSearch does not recognize: '"+ securityType + "'");
         return DOCUMENTSTATUS_REJECTED;
       }
     }
@@ -420,7 +437,7 @@ public class ElasticSearchConnector exte
     ElasticSearchIndex oi = new ElasticSearchIndex(client, config);
     try
     {
-      oi.execute(documentURI, document, inputStream, acls, denyAcls, shareAcls, shareDenyAcls, parentAcls, parentDenyAcls);
+      oi.execute(compressedDocumentURI, document, inputStream, acls, denyAcls, shareAcls, shareDenyAcls, parentAcls, parentDenyAcls);
       if (oi.getResult() != Result.OK)
         return DOCUMENTSTATUS_REJECTED;
       return DOCUMENTSTATUS_ACCEPTED;
@@ -437,12 +454,13 @@ public class ElasticSearchConnector exte
       IOutputRemoveActivity activities) throws ManifoldCFException,
       ServiceInterruption
   {
+    String compressedDocumentURI = compressDocumentURI(documentURI);
     HttpClient client = getSession();
     long startTime = System.currentTimeMillis();
     ElasticSearchDelete od = new ElasticSearchDelete(client, getConfigParameters(null));
     try
     {
-      od.execute(documentURI);
+      od.execute(compressedDocumentURI);
     }
     finally
     {