You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/10/29 12:59:38 UTC
svn commit: r1635106 - in /manifoldcf/branches/dev_1x: ./
connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/
framework/
framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/
framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/
Author: kwright
Date: Wed Oct 29 11:59:38 2014
New Revision: 1635106
URL: http://svn.apache.org/r1635106
Log:
Pull up CONNECTORS-1077 fix for GridFS from trunk
Modified:
manifoldcf/branches/dev_1x/ (props changed)
manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java
manifoldcf/branches/dev_1x/framework/ (props changed)
manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java
manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java
Propchange: manifoldcf/branches/dev_1x/
------------------------------------------------------------------------------
Merged /manifoldcf/trunk:r1634373
Modified: manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java?rev=1635106&r1=1635105&r2=1635106&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java Wed Oct 29 11:59:38 2014
@@ -390,13 +390,13 @@ public class GridFSRepositoryConnector e
public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
throws ManifoldCFException, ServiceInterruption {
-
+
for (String documentIdentifier : documentIdentifiers) {
-
+
String versionString;
GridFS gfs;
GridFSDBFile document;
-
+
getSession();
String _id = documentIdentifier;
gfs = new GridFS(session, bucket);
@@ -410,136 +410,154 @@ public class GridFSRepositoryConnector e
? Integer.toString(metadata.hashCode())
: StringUtils.EMPTY;
}
-
+
if (versionString.length() == 0 || activities.checkDocumentNeedsReindexing(documentIdentifier,versionString)) {
long startTime = System.currentTimeMillis();
- String errorCode = "OK";
+ String errorCode = null;
String errorDesc = null;
String version = versionString;
+ try {
- if (Logging.connectors.isDebugEnabled()) {
- Logging.connectors.debug("GridFS: Processing document _id = " + _id);
- }
-
- DBObject metadata = document.getMetaData();
- if (metadata == null) {
- Logging.connectors.warn("GridFS: Document " + _id + " has a null metadata - skipping.");
- activities.noDocument(_id,version);
- continue;
- }
+ if (Logging.connectors.isDebugEnabled()) {
+ Logging.connectors.debug("GridFS: Processing document _id = " + _id);
+ }
- String urlValue = document.getMetaData().get(this.url) == null
- ? StringUtils.EMPTY
- : document.getMetaData().get(this.url).toString();
- if (!StringUtils.isEmpty(urlValue)) {
- boolean validURL;
- try {
- new java.net.URI(urlValue);
- validURL = true;
- } catch (java.net.URISyntaxException e) {
- validURL = false;
+ DBObject metadata = document.getMetaData();
+ if (metadata == null) {
+ errorCode = "NULLMETADATA";
+ errorDesc = "Excluded because document had a null Metadata";
+ Logging.connectors.warn("GridFS: Document " + _id + " has a null metadata - skipping.");
+ activities.noDocument(_id, version);
+ continue;
}
- if (validURL) {
- long fileLenght = document.getLength();
- Date createdDate = document.getUploadDate();
- String fileName = document.getFilename();
- String mimeType = document.getContentType();
-
- if (!activities.checkURLIndexable(urlValue))
- {
- Logging.connectors.warn("GridFS: Document " + _id + " has a URL excluded by the output connector ('" + urlValue + "') - skipping.");
- activities.noDocument(_id, version);
- continue;
- }
-
- if (!activities.checkLengthIndexable(fileLenght))
- {
- Logging.connectors.warn("GridFS: Document " + _id + " has a length excluded by the output connector (" + fileLenght + ") - skipping.");
- activities.noDocument(_id, version);
- continue;
- }
-
- if (!activities.checkMimeTypeIndexable(mimeType))
- {
- Logging.connectors.warn("GridFS: Document " + _id + " has a mime type excluded by the output connector ('" + mimeType + "') - skipping.");
- activities.noDocument(_id, version);
- continue;
- }
-
- if (!activities.checkDateIndexable(createdDate))
- {
- Logging.connectors.warn("GridFS: Document " + _id + " has a date excluded by the output connector (" + createdDate + ") - skipping.");
- activities.noDocument(_id, version);
- continue;
+
+ String urlValue = document.getMetaData().get(this.url) == null
+ ? StringUtils.EMPTY
+ : document.getMetaData().get(this.url).toString();
+ if (!StringUtils.isEmpty(urlValue)) {
+ boolean validURL;
+ try {
+ new java.net.URI(urlValue);
+ validURL = true;
+ } catch (java.net.URISyntaxException e) {
+ validURL = false;
}
-
- RepositoryDocument rd = new RepositoryDocument();
- rd.setCreatedDate(createdDate);
- rd.setModifiedDate(createdDate);
- rd.setFileName(fileName);
- rd.setMimeType(mimeType);
- String[] aclsArray = null;
- String[] denyAclsArray = null;
- if (acl != null) {
- try {
- Object aclObject = document.getMetaData().get(acl);
- if (aclObject != null) {
- List<String> acls = (List<String>) aclObject;
- aclsArray = (String[]) acls.toArray();
+ if (validURL) {
+ long fileLenght = document.getLength();
+ Date createdDate = document.getUploadDate();
+ String fileName = document.getFilename();
+ String mimeType = document.getContentType();
+
+ if (!activities.checkURLIndexable(urlValue)) {
+ Logging.connectors.warn("GridFS: Document " + _id + " has a URL excluded by the output connector ('" + urlValue + "') - skipping.");
+ errorCode = activities.EXCLUDED_URL;
+ errorDesc = "Excluded because of URL (" + urlValue + ")";
+ activities.noDocument(_id, version);
+ continue;
+ }
+
+ if (!activities.checkLengthIndexable(fileLenght)) {
+ Logging.connectors.warn("GridFS: Document " + _id + " has a length excluded by the output connector (" + fileLenght + ") - skipping.");
+ errorCode = activities.EXCLUDED_LENGTH;
+ errorDesc = "Excluded because of length (" + fileLenght + ")";
+ activities.noDocument(_id, version);
+ continue;
+ }
+
+ if (!activities.checkMimeTypeIndexable(mimeType)) {
+ Logging.connectors.warn("GridFS: Document " + _id + " has a mime type excluded by the output connector ('" + mimeType + "') - skipping.");
+ errorCode = activities.EXCLUDED_MIMETYPE;
+ errorDesc = "Excluded because of mime type (" + mimeType + ")";
+ activities.noDocument(_id, version);
+ continue;
+ }
+
+ if (!activities.checkDateIndexable(createdDate)) {
+ Logging.connectors.warn("GridFS: Document " + _id + " has a date excluded by the output connector (" + createdDate + ") - skipping.");
+ errorCode = activities.EXCLUDED_DATE;
+ errorDesc = "Excluded because of date (" + createdDate + ")";
+ activities.noDocument(_id, version);
+ continue;
+ }
+
+ RepositoryDocument rd = new RepositoryDocument();
+ rd.setCreatedDate(createdDate);
+ rd.setModifiedDate(createdDate);
+ rd.setFileName(fileName);
+ rd.setMimeType(mimeType);
+ String[] aclsArray = null;
+ String[] denyAclsArray = null;
+ if (acl != null) {
+ try {
+ Object aclObject = document.getMetaData().get(acl);
+ if (aclObject != null) {
+ List<String> acls = (List<String>) aclObject;
+ aclsArray = (String[]) acls.toArray();
+ }
+ } catch (ClassCastException e) {
+ // This is bad because security will fail
+ Logging.connectors.warn("GridFS: Document " + _id + " metadata ACL field doesn't contain List<String> type.");
+ errorCode = "ACLTYPE";
+ errorDesc = "Allow ACL field doesn't contain List<String> type.";
+ throw new ManifoldCFException("Security decoding error: " + e.getMessage(), e);
}
- } catch (ClassCastException e) {
- // This is bad because security will fail
- Logging.connectors.warn("GridFS: Document " + _id + " metadata ACL field doesn't contain List<String> type.");
- throw new ManifoldCFException("Security decoding error: "+e.getMessage(),e);
}
- }
- if (denyAcl != null) {
- try {
- Object denyAclObject = document.getMetaData().get(denyAcl);
- if (denyAclObject != null) {
- List<String> denyAcls = (List<String>) denyAclObject;
- denyAcls.add(GLOBAL_DENY_TOKEN);
- denyAclsArray = (String[]) denyAcls.toArray();
+ if (denyAcl != null) {
+ try {
+ Object denyAclObject = document.getMetaData().get(denyAcl);
+ if (denyAclObject != null) {
+ List<String> denyAcls = (List<String>) denyAclObject;
+ denyAcls.add(GLOBAL_DENY_TOKEN);
+ denyAclsArray = (String[]) denyAcls.toArray();
+ }
+ } catch (ClassCastException e) {
+ // This is bad because security will fail
+ Logging.connectors.warn("GridFS: Document " + _id + " metadata DenyACL field doesn't contain List<String> type.");
+ errorCode = "ACLTYPE";
+ errorDesc = "Deny ACL field doesn't contain List<String> type.";
+ throw new ManifoldCFException("Security decoding error: " + e.getMessage(), e);
}
- } catch (ClassCastException e) {
- // This is bad because security will fail
- Logging.connectors.warn("GridFS: Document " + _id + " metadata DenyACL field doesn't contain List<String> type.");
- throw new ManifoldCFException("Security decoding error: "+e.getMessage(),e);
}
- }
- rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,aclsArray,denyAclsArray);
+ rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT, aclsArray, denyAclsArray);
- InputStream is = document.getInputStream();
- try {
- rd.setBinary(is, fileLenght);
- try {
- activities.ingestDocumentWithException(_id, version, urlValue, rd);
- } catch (IOException e) {
- handleIOException(e);
- }
- } finally {
+ InputStream is = document.getInputStream();
try {
- is.close();
- } catch (IOException e) {
- handleIOException(e);
+ rd.setBinary(is, fileLenght);
+ try {
+ activities.ingestDocumentWithException(_id, version, urlValue, rd);
+ } catch (IOException e) {
+ handleIOException(e);
+ }
+ } finally {
+ try {
+ is.close();
+ } catch (IOException e) {
+ handleIOException(e);
+ }
}
+ gfs.getDB().getMongo().getConnector().close();
+ session = null;
+ activities.recordActivity(startTime, ACTIVITY_FETCH,
+ fileLenght, _id, "OK", null, null);
+ } else {
+ Logging.connectors.warn("GridFS: Document " + _id + " has a invalid URL: " + urlValue + " - skipping.");
+ errorCode = activities.BAD_URL;
+ errorDesc = "Excluded because document had illegal URL ('" + urlValue + "')";
+ activities.noDocument(_id, version);
}
- gfs.getDB().getMongo().getConnector().close();
- session = null;
- activities.recordActivity(startTime, ACTIVITY_FETCH,
- fileLenght, _id, errorCode, errorDesc, null);
} else {
- Logging.connectors.warn("GridFS: Document " + _id + " has a invalid URL: " + urlValue + " - skipping.");
- activities.noDocument(_id,version);
+ Logging.connectors.warn("GridFS: Document " + _id + " has a null URL - skipping.");
+ errorCode = activities.NULL_URL;
+ errorDesc = "Excluded because document had a null URL.";
+ activities.noDocument(_id, version);
+ }
+ } finally {
+ if (errorCode != null) {
+ activities.recordActivity(startTime, ACTIVITY_FETCH, document.getLength(), _id, errorCode, errorDesc, null);
}
- } else {
- Logging.connectors.warn("GridFS: Document " + _id + " has a null URL - skipping.");
- activities.noDocument(_id,version);
}
-
}
}
-
}
protected static void handleIOException(IOException e) throws ManifoldCFException, ServiceInterruption {
Propchange: manifoldcf/branches/dev_1x/framework/
------------------------------------------------------------------------------
Merged /manifoldcf/trunk/framework:r1634373
Modified: manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java?rev=1635106&r1=1635105&r2=1635106&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java (original)
+++ manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java Wed Oct 29 11:59:38 2014
@@ -40,6 +40,8 @@ public interface IOutputHistoryActivity
public static final String JSON_ERROR = "JSONERROR";
public static final String INDEX_NOT_FOUND = "INDEXNOTFOUND";
public static final String XPATH_EXCEPTION = "XPATHEXCEPTION";
+ public static final String BAD_URL = "BADURL";
+ public static final String NULL_URL = "NULLURL";
/** Record time-stamped information about the activity of the output connector.
*@param startTime is either null or the time since the start of epoch in milliseconds (Jan 1, 1970). Every
* activity has an associated time; the startTime field records when the activity began. A null value
Modified: manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java?rev=1635106&r1=1635105&r2=1635106&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java (original)
+++ manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java Wed Oct 29 11:59:38 2014
@@ -32,7 +32,8 @@ public interface IHistoryActivity
public static final String EXCLUDED_LENGTH = IOutputHistoryActivity.EXCLUDED_LENGTH;
public static final String EXCLUDED_MIMETYPE = IOutputHistoryActivity.EXCLUDED_MIMETYPE;
public static final String EXCLUDED_DATE = IOutputHistoryActivity.EXCLUDED_DATE;
-
+ public static final String BAD_URL = IOutputHistoryActivity.BAD_URL;
+ public static final String NULL_URL = IOutputHistoryActivity.NULL_URL;
/** Record time-stamped information about the activity of the connector.
*@param startTime is either null or the time since the start of epoch in milliseconds (Jan 1, 1970). Every
* activity has an associated time; the startTime field records when the activity began. A null value