You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2021/03/20 08:18:36 UTC
svn commit: r1887840 - in /manifoldcf/trunk: CHANGES.txt
connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java
Author: kwright
Date: Sat Mar 20 08:18:36 2021
New Revision: 1887840
URL: http://svn.apache.org/viewvc?rev=1887840&view=rev
Log:
Fix for CONNECTORS-1666.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1887840&r1=1887839&r2=1887840&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Sat Mar 20 08:18:36 2021
@@ -3,6 +3,11 @@ $Id$
======================= 2.19-dev =====================
+CONNECTORS-1666: Elastic Search connector now limits size of URI according to
+ES rules. Standard MCF hashing is done to ensure this, but only if the documentURI
+is large enough to warrant it.
+(Shirai Takashi, Karl Wright)
+
CONNECTORS-1656: Ensure legit XML is produced for Tika by the html extractor.
(Julien Massiera)
Modified: manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java?rev=1887840&r1=1887839&r2=1887840&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java (original)
+++ manifoldcf/trunk/connectors/elasticsearch/connector/src/main/java/org/apache/manifoldcf/agents/output/elasticsearch/ElasticSearchConnector.java Sat Mar 20 08:18:36 2021
@@ -62,12 +62,11 @@ import org.apache.manifoldcf.agents.outp
import org.apache.manifoldcf.agents.output.elasticsearch.ElasticSearchConnection.Result;
import org.apache.manifoldcf.core.interfaces.Specification;
import org.apache.manifoldcf.core.interfaces.ConfigParams;
-//import org.apache.manifoldcf.core.interfaces.ConfigurationNode;
import org.apache.manifoldcf.core.interfaces.IHTTPOutput;
import org.apache.manifoldcf.core.interfaces.IPostParameters;
import org.apache.manifoldcf.core.interfaces.IThreadContext;
import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
-//import org.apache.manifoldcf.core.interfaces.SpecificationNode;
+import org.apache.manifoldcf.core.system.ManifoldCF;
import org.apache.manifoldcf.core.interfaces.VersionContext;
import org.apache.manifoldcf.connectorcommon.interfaces.IKeystoreManager;
import org.apache.manifoldcf.connectorcommon.common.InterruptibleSocketFactory;
@@ -103,6 +102,9 @@ public class ElasticSearchConnector exte
/** Connection expiration interval */
private static final long EXPIRATION_INTERVAL = 60000L;
+ /** ID length is limited, or you get the HTTP 400 error */
+ private static final int maxIdLength = 512;
+
private HttpClientConnectionManager connectionManager = null;
private HttpClient client = null;
private long expirationTime = -1L;
@@ -200,7 +202,21 @@ public class ElasticSearchConnector exte
client = null;
expirationTime = -1L;
}
-
+
+ /** Create an ID string for ElasticSearch from a document URI, hashing it if it is too long.
+ * @param documentURI is the URI of the document.
+ * @return the URI unchanged if its UTF-8 encoding fits the ID limit, otherwise the hashed URI.
+ */
+ protected static String compressDocumentURI(String documentURI)
+ throws ManifoldCFException
+ {
+ // If the ID is too long we must hash it, but for backwards compatibility only when needed.
+ // ElasticSearch limits the ID in bytes, not chars, so measure the UTF-8 encoded length.
+ if (documentURI.getBytes(java.nio.charset.StandardCharsets.UTF_8).length <= maxIdLength)
+ return documentURI;
+ return ManifoldCF.hash(documentURI);
+ }
+
/** This method is called to assess whether this connector instance should
* actually be counted as being connected.
*@return true if the connector instance is actually connected.
@@ -375,6 +391,7 @@ public class ElasticSearchConnector exte
IOutputAddActivity activities) throws ManifoldCFException,
ServiceInterruption, IOException
{
+ String compressedDocumentURI = compressDocumentURI(documentURI);
HttpClient client = getSession();
ElasticSearchConfig config = getConfigParameters(null);
@@ -411,7 +428,7 @@ public class ElasticSearchConnector exte
else
{
// Don't know how to deal with it
- activities.recordActivity(null,ELASTICSEARCH_INDEXATION_ACTIVITY,document.getBinaryLength(),documentURI,activities.UNKNOWN_SECURITY,"Rejected document that has security info which ElasticSearch does not recognize: '"+ securityType + "'");
+ activities.recordActivity(null, ELASTICSEARCH_INDEXATION_ACTIVITY, document.getBinaryLength(), documentURI, activities.UNKNOWN_SECURITY, "Rejected document that has security info which ElasticSearch does not recognize: '"+ securityType + "'");
return DOCUMENTSTATUS_REJECTED;
}
}
@@ -420,7 +437,7 @@ public class ElasticSearchConnector exte
ElasticSearchIndex oi = new ElasticSearchIndex(client, config);
try
{
- oi.execute(documentURI, document, inputStream, acls, denyAcls, shareAcls, shareDenyAcls, parentAcls, parentDenyAcls);
+ oi.execute(compressedDocumentURI, document, inputStream, acls, denyAcls, shareAcls, shareDenyAcls, parentAcls, parentDenyAcls);
if (oi.getResult() != Result.OK)
return DOCUMENTSTATUS_REJECTED;
return DOCUMENTSTATUS_ACCEPTED;
@@ -437,12 +454,13 @@ public class ElasticSearchConnector exte
IOutputRemoveActivity activities) throws ManifoldCFException,
ServiceInterruption
{
+ String compressedDocumentURI = compressDocumentURI(documentURI);
HttpClient client = getSession();
long startTime = System.currentTimeMillis();
ElasticSearchDelete od = new ElasticSearchDelete(client, getConfigParameters(null));
try
{
- od.execute(documentURI);
+ od.execute(compressedDocumentURI);
}
finally
{