You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/10/08 19:54:48 UTC
svn commit: r1630188 [1/2] - in /manifoldcf/trunk: ./
connectors/alfresco-webscript/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/
connectors/alfresco-webscript/connector/src/test/java/org/apache/manifoldcf/crawler/...
Author: kwright
Date: Wed Oct 8 17:54:47 2014
New Revision: 1630188
URL: http://svn.apache.org/r1630188
Log:
Implement CONNECTORS-1067.
Modified:
manifoldcf/trunk/ (props changed)
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/alfresco-webscript/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnector.java
manifoldcf/trunk/connectors/alfresco-webscript/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnectorTest.java
manifoldcf/trunk/connectors/cmis/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/cmis/CmisRepositoryConnector.java
manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java
manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java
manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java
manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java
manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java
manifoldcf/trunk/connectors/sharepoint/ (props changed)
manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
manifoldcf/trunk/connectors/wiki/ (props changed)
manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java
manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputCheckActivity.java
manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IPipelineConnector.java
manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/output/BaseOutputConnector.java
manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/transformation/BaseTransformationConnector.java
manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IFingerprintActivity.java
manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
Propchange: manifoldcf/trunk/
------------------------------------------------------------------------------
Merged /manifoldcf/branches/CONNECTORS-1067:r1630049-1630186
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1630188&r1=1630187&r2=1630188&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Wed Oct 8 17:54:47 2014
@@ -3,6 +3,10 @@ $Id$
======================= 2.0-dev =====================
+CONNECTORS-1067: Allow document filtering on modification date,
+and also hook this up in all repository connectors where it makes sense.
+(Karl Wright)
+
CONNECTORS-1057: Implement full internationalization for alfresco-webscript
connector.
(Karl Wright)
Modified: manifoldcf/trunk/connectors/alfresco-webscript/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/alfresco-webscript/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnector.java?rev=1630188&r1=1630187&r2=1630188&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/alfresco-webscript/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnector.java (original)
+++ manifoldcf/trunk/connectors/alfresco-webscript/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnector.java Wed Oct 8 17:54:47 2014
@@ -271,11 +271,16 @@ public class AlfrescoConnector extends B
continue;
}
- if (mimeType != null && !activities.checkMimeTypeIndexable(mimeType)) {
+ if (!activities.checkMimeTypeIndexable(mimeType)) {
activities.noDocument(doc, documentVersion);
continue;
}
+ if (!activities.checkDateIndexable(modifiedDate)) {
+ activities.noDocument(doc, documentVersion);
+ continue;
+ }
+
RepositoryDocument rd = new RepositoryDocument();
rd.addField(FIELD_NODEREF, nodeRef);
rd.addField(FIELD_TYPE, type);
Modified: manifoldcf/trunk/connectors/alfresco-webscript/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnectorTest.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/alfresco-webscript/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnectorTest.java?rev=1630188&r1=1630187&r2=1630188&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/alfresco-webscript/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnectorTest.java (original)
+++ manifoldcf/trunk/connectors/alfresco-webscript/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnectorTest.java Wed Oct 8 17:54:47 2014
@@ -20,6 +20,7 @@ import static org.mockito.Matchers.any;
import static org.mockito.Matchers.anyInt;
import static org.mockito.Matchers.anyLong;
import static org.mockito.Matchers.anyString;
+import static org.mockito.Matchers.anyObject;
import static org.mockito.Matchers.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.never;
@@ -32,6 +33,7 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
+import java.util.Date;
import org.alfresco.consulting.indexer.client.AlfrescoClient;
import org.alfresco.consulting.indexer.client.AlfrescoFilters;
@@ -128,6 +130,8 @@ public class AlfrescoConnectorTest {
.thenReturn(true);
when(activities.checkMimeTypeIndexable(anyString()))
.thenReturn(true);
+ when(activities.checkDateIndexable((Date)anyObject()))
+ .thenReturn(true);
IExistingVersions statuses = mock(IExistingVersions.class);
when(client.fetchNode(anyString()))
@@ -152,6 +156,8 @@ public class AlfrescoConnectorTest {
verify(activities)
.checkMimeTypeIndexable(eq("text/plain"));
verify(activities)
+ .checkDateIndexable(eq(org.apache.manifoldcf.core.common.DateParser.parseISO8601Date((String)testDocument.get("cm:modified"))));
+ verify(activities)
.ingestDocumentWithException(eq(TestDocument.uuid), anyString(),
eq((String)testDocument.get("contentUrlPath")), rd.capture());
Modified: manifoldcf/trunk/connectors/cmis/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/cmis/CmisRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/cmis/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/cmis/CmisRepositoryConnector.java?rev=1630188&r1=1630187&r2=1630188&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/cmis/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/cmis/CmisRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/cmis/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/cmis/CmisRepositoryConnector.java Wed Oct 8 17:54:47 2014
@@ -1143,171 +1143,198 @@ public class CmisRepositoryConnector ext
activities.addDocumentReference(child.getId(), documentIdentifier,
RELATIONSHIP_CHILD);
}
- } else if(baseTypeId.equals(CMIS_DOCUMENT_BASE_TYPE)){
- // content ingestion
+ } else if(baseTypeId.equals(CMIS_DOCUMENT_BASE_TYPE)) {
+ // content ingestion
- Document document = (Document) cmisObject;
- long fileLength;
- InputStream is;
- try {
- fileLength = document.getContentStreamLength();
- if (fileLength > 0)
- is = document.getContentStream().getStream();
- else
- is = null;
- } catch (CmisObjectNotFoundException e) {
- // Document gone
- activities.deleteDocument(documentIdentifier);
- continue;
- }
+ Document document = (Document) cmisObject;
- try {
- RepositoryDocument rd = new RepositoryDocument();
Date createdDate = document.getCreationDate().getTime();
Date modifiedDate = document.getLastModificationDate().getTime();
-
- rd.setFileName(document.getContentStreamFileName());
- rd.setMimeType(document.getContentStreamMimeType());
+ long fileLength = document.getContentStreamLength();
+ String fileName = document.getContentStreamFileName();
+ String mimeType = document.getContentStreamMimeType();
+ //documentURI
+ String documentURI = CmisRepositoryConnectorUtils.getDocumentURL(document, session);
+
+ // Do any filtering (which will save us work)
+ if (!activities.checkURLIndexable(documentURI))
+ {
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ if (!activities.checkMimeTypeIndexable(mimeType))
+ {
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ if (!activities.checkLengthIndexable(fileLength))
+ {
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ if (!activities.checkDateIndexable(modifiedDate))
+ {
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ RepositoryDocument rd = new RepositoryDocument();
+ rd.setFileName(fileName);
+ rd.setMimeType(mimeType);
rd.setCreatedDate(createdDate);
rd.setModifiedDate(modifiedDate);
-
- //binary
- if(is != null) {
- rd.setBinary(is, fileLength);
- } else {
- rd.setBinary(new NullInputStream(0),0);
+
+ InputStream is;
+ try {
+ if (fileLength > 0)
+ is = document.getContentStream().getStream();
+ else
+ is = null;
+ } catch (CmisObjectNotFoundException e) {
+ // Document gone
+ activities.deleteDocument(documentIdentifier);
+ continue;
}
+
+ try {
+ //binary
+ if(is != null) {
+ rd.setBinary(is, fileLength);
+ } else {
+ rd.setBinary(new NullInputStream(0),0);
+ }
- //properties
- List<Property<?>> properties = document.getProperties();
- String id = StringUtils.EMPTY;
- for (Property<?> property : properties) {
- String propertyId = property.getId();
-
- if(CmisRepositoryConnectorUtils.existsInSelectClause(cmisQuery, propertyId)){
+ //properties
+ List<Property<?>> properties = document.getProperties();
+ String id = StringUtils.EMPTY;
+ for (Property<?> property : properties) {
+ String propertyId = property.getId();
- if (propertyId.endsWith(Constants.PARAM_OBJECT_ID)) {
- id = (String) property.getValue();
-
- if (property.getValue() !=null
- || property.getValues() != null) {
- PropertyType propertyType = property.getType();
-
- switch (propertyType) {
-
- case STRING:
- case ID:
- case URI:
- case HTML:
- if(property.isMultiValued()){
- List<String> htmlPropertyValues = (List<String>) property.getValues();
- for (String htmlPropertyValue : htmlPropertyValues) {
- rd.addField(propertyId, htmlPropertyValue);
- }
- } else {
- String stringValue = (String) property.getValue();
- if(StringUtils.isNotEmpty(stringValue)){
- rd.addField(propertyId, stringValue);
- }
- }
- break;
-
- case BOOLEAN:
- if(property.isMultiValued()){
- List<Boolean> booleanPropertyValues = (List<Boolean>) property.getValues();
- for (Boolean booleanPropertyValue : booleanPropertyValues) {
- rd.addField(propertyId, booleanPropertyValue.toString());
- }
- } else {
- Boolean booleanValue = (Boolean) property.getValue();
- if(booleanValue!=null){
- rd.addField(propertyId, booleanValue.toString());
- }
- }
- break;
-
- case INTEGER:
- if(property.isMultiValued()){
- List<BigInteger> integerPropertyValues = (List<BigInteger>) property.getValues();
- for (BigInteger integerPropertyValue : integerPropertyValues) {
- rd.addField(propertyId, integerPropertyValue.toString());
- }
- } else {
- BigInteger integerValue = (BigInteger) property.getValue();
- if(integerValue!=null){
- rd.addField(propertyId, integerValue.toString());
- }
- }
- break;
-
- case DECIMAL:
- if(property.isMultiValued()){
- List<BigDecimal> decimalPropertyValues = (List<BigDecimal>) property.getValues();
- for (BigDecimal decimalPropertyValue : decimalPropertyValues) {
- rd.addField(propertyId, decimalPropertyValue.toString());
- }
- } else {
- BigDecimal decimalValue = (BigDecimal) property.getValue();
- if(decimalValue!=null){
- rd.addField(propertyId, decimalValue.toString());
- }
- }
- break;
+ if(CmisRepositoryConnectorUtils.existsInSelectClause(cmisQuery, propertyId)){
+
+ if (propertyId.endsWith(Constants.PARAM_OBJECT_ID)) {
+ id = (String) property.getValue();
- case DATETIME:
- if(property.isMultiValued()){
- List<GregorianCalendar> datePropertyValues = (List<GregorianCalendar>) property.getValues();
- for (GregorianCalendar datePropertyValue : datePropertyValues) {
- rd.addField(propertyId,
- ISO8601_DATE_FORMATTER.format(datePropertyValue.getTime()));
- }
- } else {
- GregorianCalendar dateValue = (GregorianCalendar) property.getValue();
- if(dateValue!=null){
- rd.addField(propertyId, ISO8601_DATE_FORMATTER.format(dateValue.getTime()));
- }
+ if (property.getValue() !=null
+ || property.getValues() != null) {
+ PropertyType propertyType = property.getType();
+
+ switch (propertyType) {
+
+ case STRING:
+ case ID:
+ case URI:
+ case HTML:
+ if(property.isMultiValued()){
+ List<String> htmlPropertyValues = (List<String>) property.getValues();
+ for (String htmlPropertyValue : htmlPropertyValues) {
+ rd.addField(propertyId, htmlPropertyValue);
+ }
+ } else {
+ String stringValue = (String) property.getValue();
+ if(StringUtils.isNotEmpty(stringValue)){
+ rd.addField(propertyId, stringValue);
+ }
+ }
+ break;
+
+ case BOOLEAN:
+ if(property.isMultiValued()){
+ List<Boolean> booleanPropertyValues = (List<Boolean>) property.getValues();
+ for (Boolean booleanPropertyValue : booleanPropertyValues) {
+ rd.addField(propertyId, booleanPropertyValue.toString());
+ }
+ } else {
+ Boolean booleanValue = (Boolean) property.getValue();
+ if(booleanValue!=null){
+ rd.addField(propertyId, booleanValue.toString());
+ }
+ }
+ break;
+
+ case INTEGER:
+ if(property.isMultiValued()){
+ List<BigInteger> integerPropertyValues = (List<BigInteger>) property.getValues();
+ for (BigInteger integerPropertyValue : integerPropertyValues) {
+ rd.addField(propertyId, integerPropertyValue.toString());
+ }
+ } else {
+ BigInteger integerValue = (BigInteger) property.getValue();
+ if(integerValue!=null){
+ rd.addField(propertyId, integerValue.toString());
+ }
+ }
+ break;
+
+ case DECIMAL:
+ if(property.isMultiValued()){
+ List<BigDecimal> decimalPropertyValues = (List<BigDecimal>) property.getValues();
+ for (BigDecimal decimalPropertyValue : decimalPropertyValues) {
+ rd.addField(propertyId, decimalPropertyValue.toString());
+ }
+ } else {
+ BigDecimal decimalValue = (BigDecimal) property.getValue();
+ if(decimalValue!=null){
+ rd.addField(propertyId, decimalValue.toString());
+ }
+ }
+ break;
+
+ case DATETIME:
+ if(property.isMultiValued()){
+ List<GregorianCalendar> datePropertyValues = (List<GregorianCalendar>) property.getValues();
+ for (GregorianCalendar datePropertyValue : datePropertyValues) {
+ rd.addField(propertyId,
+ ISO8601_DATE_FORMATTER.format(datePropertyValue.getTime()));
+ }
+ } else {
+ GregorianCalendar dateValue = (GregorianCalendar) property.getValue();
+ if(dateValue!=null){
+ rd.addField(propertyId, ISO8601_DATE_FORMATTER.format(dateValue.getTime()));
+ }
+ }
+ break;
+
+ default:
+ break;
}
- break;
-
- default:
- break;
}
+
}
-
+
}
-
}
- }
-
- //ingestion
- //documentURI
- String documentURI = CmisRepositoryConnectorUtils.getDocumentURL(document, session);
-
- try {
- activities.ingestDocumentWithException(documentIdentifier, versionString, documentURI, rd);
- } catch (IOException e) {
- errorCode = "IO ERROR";
- errorDesc = e.getMessage();
- handleIOException(e, "reading file input stream");
- }
- } finally {
- try {
- if(is!=null){
- is.close();
+ //ingestion
+
+
+ try {
+ activities.ingestDocumentWithException(documentIdentifier, versionString, documentURI, rd);
+ } catch (IOException e) {
+ errorCode = "IO ERROR";
+ errorDesc = e.getMessage();
+ handleIOException(e, "reading file input stream");
}
- } catch (IOException e) {
- errorCode = "IO ERROR";
- errorDesc = e.getMessage();
- handleIOException(e, "closing file input stream");
} finally {
- activities.recordActivity(new Long(startTime), ACTIVITY_READ,
- fileLength, documentIdentifier, errorCode, errorDesc, null);
+ try {
+ if(is!=null){
+ is.close();
+ }
+ } catch (IOException e) {
+ errorCode = "IO ERROR";
+ errorDesc = e.getMessage();
+ handleIOException(e, "closing file input stream");
+ } finally {
+ activities.recordActivity(new Long(startTime), ACTIVITY_READ,
+ fileLength, documentIdentifier, errorCode, errorDesc, null);
+ }
}
}
- }
- else
- activities.deleteDocument(documentIdentifier);
+ else
+ activities.noDocument(documentIdentifier,versionString);
}
}
Modified: manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java?rev=1630188&r1=1630187&r2=1630188&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java (original)
+++ manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java Wed Oct 8 17:54:47 2014
@@ -330,6 +330,45 @@ public class FileConnector extends org.a
// We still need to check based on file data.
if (checkIngest(file,spec))
{
+ String fileName = file.getName();
+ Date modifiedDate = new Date(file.lastModified());
+ String mimeType = mapExtensionToMimeType(fileName);
+ String uri;
+ if (convertPath != null) {
+ // WGET-compatible input; convert back to external URI
+ uri = convertToWGETURI(convertPath);
+ } else {
+ uri = convertToURI(documentIdentifier);
+ }
+
+ if (!activities.checkLengthIndexable(fileLength))
+ {
+ Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because length was excluded by output connector.");
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ if (!activities.checkURLIndexable(uri))
+ {
+ Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because URL was excluded by output connector.");
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ if (!activities.checkDateIndexable(modifiedDate))
+ {
+ Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because date ("+modifiedDate+") was excluded by output connector.");
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ if (!activities.checkMimeTypeIndexable(mimeType))
+ {
+ Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because mime type ('"+mimeType+"') was excluded by output connector.");
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
long startTime = System.currentTimeMillis();
String errorCode = "OK";
String errorDesc = null;
@@ -345,17 +384,13 @@ public class FileConnector extends org.a
{
RepositoryDocument data = new RepositoryDocument();
data.setBinary(is,fileLength);
- String fileName = file.getName();
data.setFileName(fileName);
- data.setMimeType(mapExtensionToMimeType(fileName));
- data.setModifiedDate(new Date(file.lastModified()));
- String uri;
+ data.setMimeType(mimeType);
+ data.setModifiedDate(modifiedDate);
if (convertPath != null) {
// WGET-compatible input; convert back to external URI
- uri = convertToWGETURI(convertPath);
data.addField("uri",uri);
} else {
- uri = convertToURI(documentIdentifier);
data.addField("uri",file.toString());
}
// MHL for other metadata
Modified: manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java?rev=1630188&r1=1630187&r2=1630188&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java Wed Oct 8 17:54:47 2014
@@ -1090,18 +1090,51 @@ public class GoogleDriveRepositoryConnec
Logging.connectors.debug("GOOGLEDRIVE: its a file");
}
- // We always direct to the PDF except for Spreadsheets
- String documentURI = null;
- if (!googleFile.getMimeType().equals("application/vnd.google-apps.spreadsheet")) {
- documentURI = getUrl(googleFile, "application/pdf");
- } else {
- documentURI = getUrl(googleFile, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
- }
-
// Get the file length
- Long fileLength = Objects.firstNonNull(googleFile.getFileSize(), 0L);
- if (fileLength != null) {
+ Long fileLengthLong = Objects.firstNonNull(googleFile.getFileSize(), 0L);
+ if (fileLengthLong != null) {
+ // Now do standard stuff
+ long fileLength = fileLengthLong.longValue();
+ String mimeType = googleFile.getMimeType();
+ DateTime createdDateObject = googleFile.getCreatedDate();
+ DateTime modifiedDateObject = googleFile.getModifiedDate();
+ String extension = googleFile.getFileExtension();
+ String title = googleFile.getTitle();
+ Date createdDate = (createdDateObject==null)?null:new Date(createdDateObject.getValue());
+ Date modifiedDate = (modifiedDateObject==null)?null:new Date(modifiedDateObject.getValue());
+ // We always direct to the PDF except for Spreadsheets
+ String documentURI = null;
+ if (!mimeType.equals("application/vnd.google-apps.spreadsheet")) {
+ documentURI = getUrl(googleFile, "application/pdf");
+ } else {
+ documentURI = getUrl(googleFile, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+ }
+
+ if (!activities.checkLengthIndexable(fileLength))
+ {
+ activities.noDocument(nodeId,version);
+ continue;
+ }
+
+ if (!activities.checkURLIndexable(documentURI))
+ {
+ activities.noDocument(nodeId,version);
+ continue;
+ }
+
+ if (!activities.checkMimeTypeIndexable(mimeType))
+ {
+ activities.noDocument(nodeId,version);
+ continue;
+ }
+
+ if (!activities.checkDateIndexable(modifiedDate))
+ {
+ activities.noDocument(nodeId,version);
+ continue;
+ }
+
RepositoryDocument rd = new RepositoryDocument();
if (acls != null) {
@@ -1112,19 +1145,12 @@ public class GoogleDriveRepositoryConnec
}
}
- // Now do standard stuff
- String mimeType = googleFile.getMimeType();
- DateTime createdDate = googleFile.getCreatedDate();
- DateTime modifiedDate = googleFile.getModifiedDate();
- String extension = googleFile.getFileExtension();
- String title = googleFile.getTitle();
-
if (mimeType != null)
rd.setMimeType(mimeType);
if (createdDate != null)
- rd.setCreatedDate(new Date(createdDate.getValue()));
+ rd.setCreatedDate(createdDate);
if (modifiedDate != null)
- rd.setModifiedDate(new Date(modifiedDate.getValue()));
+ rd.setModifiedDate(modifiedDate);
if (extension != null)
{
if (title == null)
Modified: manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java?rev=1630188&r1=1630187&r2=1630188&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java Wed Oct 8 17:54:47 2014
@@ -415,8 +415,6 @@ public class GridFSRepositoryConnector e
String errorDesc = null;
String version = versionString;
- RepositoryDocument rd = new RepositoryDocument();
-
if (Logging.connectors.isDebugEnabled()) {
Logging.connectors.debug("GridFS: Processing document _id = " + _id);
}
@@ -441,44 +439,77 @@ public class GridFSRepositoryConnector e
}
if (validURL) {
long fileLenght = document.getLength();
- InputStream is = document.getInputStream();
- try {
- Date indexingDate = new Date();
- rd.setBinary(is, fileLenght);
- rd.setCreatedDate(document.getUploadDate());
- rd.setFileName(document.getFilename());
- rd.setIndexingDate(indexingDate);
- rd.setMimeType(document.getContentType());
- String[] aclsArray = null;
- String[] denyAclsArray = null;
- if (acl != null) {
- try {
- Object aclObject = document.getMetaData().get(acl);
- if (aclObject != null) {
- List<String> acls = (List<String>) aclObject;
- aclsArray = (String[]) acls.toArray();
- }
- } catch (ClassCastException e) {
- // This is bad because security will fail
- Logging.connectors.warn("GridFS: Document " + _id + " metadata ACL field doesn't contain List<String> type.");
- throw new ManifoldCFException("Security decoding error: "+e.getMessage(),e);
+ Date createdDate = document.getUploadDate();
+ String fileName = document.getFilename();
+ String mimeType = document.getContentType();
+
+ if (!activities.checkURLIndexable(urlValue))
+ {
+ Logging.connectors.warn("GridFS: Document " + _id + " has a URL excluded by the output connector ('" + urlValue + "') - skipping.");
+ activities.noDocument(_id, version);
+ continue;
+ }
+
+ if (!activities.checkLengthIndexable(fileLenght))
+ {
+ Logging.connectors.warn("GridFS: Document " + _id + " has a length excluded by the output connector (" + fileLenght + ") - skipping.");
+ activities.noDocument(_id, version);
+ continue;
+ }
+
+ if (!activities.checkMimeTypeIndexable(mimeType))
+ {
+ Logging.connectors.warn("GridFS: Document " + _id + " has a mime type excluded by the output connector ('" + mimeType + "') - skipping.");
+ activities.noDocument(_id, version);
+ continue;
+ }
+
+ if (!activities.checkDateIndexable(createdDate))
+ {
+ Logging.connectors.warn("GridFS: Document " + _id + " has a date excluded by the output connector (" + createdDate + ") - skipping.");
+ activities.noDocument(_id, version);
+ continue;
+ }
+
+ RepositoryDocument rd = new RepositoryDocument();
+ rd.setCreatedDate(createdDate);
+ rd.setModifiedDate(createdDate);
+ rd.setFileName(fileName);
+ rd.setMimeType(mimeType);
+ String[] aclsArray = null;
+ String[] denyAclsArray = null;
+ if (acl != null) {
+ try {
+ Object aclObject = document.getMetaData().get(acl);
+ if (aclObject != null) {
+ List<String> acls = (List<String>) aclObject;
+ aclsArray = (String[]) acls.toArray();
}
+ } catch (ClassCastException e) {
+ // This is bad because security will fail
+ Logging.connectors.warn("GridFS: Document " + _id + " metadata ACL field doesn't contain List<String> type.");
+ throw new ManifoldCFException("Security decoding error: "+e.getMessage(),e);
}
- if (denyAcl != null) {
- try {
- Object denyAclObject = document.getMetaData().get(denyAcl);
- if (denyAclObject != null) {
- List<String> denyAcls = (List<String>) denyAclObject;
- denyAcls.add(GLOBAL_DENY_TOKEN);
- denyAclsArray = (String[]) denyAcls.toArray();
- }
- } catch (ClassCastException e) {
- // This is bad because security will fail
- Logging.connectors.warn("GridFS: Document " + _id + " metadata DenyACL field doesn't contain List<String> type.");
- throw new ManifoldCFException("Security decoding error: "+e.getMessage(),e);
+ }
+ if (denyAcl != null) {
+ try {
+ Object denyAclObject = document.getMetaData().get(denyAcl);
+ if (denyAclObject != null) {
+ List<String> denyAcls = (List<String>) denyAclObject;
+ denyAcls.add(GLOBAL_DENY_TOKEN);
+ denyAclsArray = (String[]) denyAcls.toArray();
}
+ } catch (ClassCastException e) {
+ // This is bad because security will fail
+ Logging.connectors.warn("GridFS: Document " + _id + " metadata DenyACL field doesn't contain List<String> type.");
+ throw new ManifoldCFException("Security decoding error: "+e.getMessage(),e);
}
- rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,aclsArray,denyAclsArray);
+ }
+ rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,aclsArray,denyAclsArray);
+
+ InputStream is = document.getInputStream();
+ try {
+ rd.setBinary(is, fileLenght);
try {
activities.ingestDocumentWithException(_id, version, urlValue, rd);
} catch (IOException e) {
Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java?rev=1630188&r1=1630187&r2=1630188&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java Wed Oct 8 17:54:47 2014
@@ -386,27 +386,49 @@ public class HDFSRepositoryConnector ext
continue;
}
+ // It is a file to be indexed.
long fileLength = fileStatus.getLen();
- if (!activities.checkLengthIndexable(fileLength)) {
+ String fileName = fileStatus.getPath().getName();
+ String mimeType = mapExtensionToMimeType(fileStatus.getPath().getName());
+ Date modifiedDate = new Date(fileStatus.getModificationTime());
+ String uri;
+ if (convertPath != null) {
+ uri = convertToWGETURI(convertPath);
+ } else {
+ uri = fileStatus.getPath().toUri().toString();
+ }
+
+ if (!activities.checkLengthIndexable(fileLength))
+ {
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ if (!activities.checkURLIndexable(uri))
+ {
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ if (!activities.checkMimeTypeIndexable(mimeType))
+ {
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ if (!activities.checkDateIndexable(modifiedDate))
+ {
activities.noDocument(documentIdentifier,versionString);
continue;
}
-
- // It is a file to be indexed.
// Prepare the metadata part of RepositoryDocument
RepositoryDocument data = new RepositoryDocument();
- data.setFileName(fileStatus.getPath().getName());
- data.setMimeType(mapExtensionToMimeType(fileStatus.getPath().getName()));
- data.setModifiedDate(new Date(fileStatus.getModificationTime()));
+ data.setFileName(fileName);
+ data.setMimeType(mimeType);
+ data.setModifiedDate(modifiedDate);
- String uri;
- if (convertPath != null) {
- uri = convertToWGETURI(convertPath);
- } else {
- uri = fileStatus.getPath().toUri().toString();
- }
data.addField("uri",uri);
// We will record document fetch as an activity
Modified: manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java?rev=1630188&r1=1630187&r2=1630188&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java (original)
+++ manifoldcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java Wed Oct 8 17:54:47 2014
@@ -778,131 +778,189 @@ public class SharedDriveConnector extend
if (fileName != null && !file.isHidden())
{
String uri = ingestionURI;
+ String fileNameString = file.getName();
+ Date lastModifiedDate = new Date(file.lastModified());
+ Date creationDate = new Date(file.createTime());
+ String contentType = mapExtensionToMimeType(fileNameString);
- if (activities.checkURLIndexable(uri))
+ if (!activities.checkURLIndexable(uri))
{
- // Initialize repository document with common stuff, and find the URI
- RepositoryDocument rd = new RepositoryDocument();
- prepareForIndexing(rd,file,
- shareAllow,shareDeny,
- parentAllow,parentDeny,
- documentAllow,documentDeny,
- pathAttributeName,pathAttributeValue);
-
- // manipulate path to include the DFS alias, not the literal path
- // String newPath = matchPrefix + fileName.substring(matchReplace.length());
- String newPath = fileName;
- if (checkNeedFileData(newPath, spec))
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("JCIFS: Local file data needed for '"+documentIdentifier+"'");
+ Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept URL ('"+uri+"')");
+ activities.recordActivity(null,ACTIVITY_ACCESS,
+ null,documentIdentifier,"Skip","Output connector refused URL",null);
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ if (!activities.checkMimeTypeIndexable(contentType))
+ {
+ Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept content type ('"+contentType+"')");
+ activities.recordActivity(null,ACTIVITY_ACCESS,
+ null,documentIdentifier,"Skip","Output connector refused mime type",null);
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
- // Create a temporary file, and use that for the check and then the ingest
- File tempFile = File.createTempFile("_sdc_",null);
+ if (!activities.checkDateIndexable(lastModifiedDate))
+ {
+ Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept date ("+lastModifiedDate+")");
+ activities.recordActivity(null,ACTIVITY_ACCESS,
+ null,documentIdentifier,"Skip","Output connector refused date",null);
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ // Initialize repository document with common stuff, and find the URI
+ RepositoryDocument rd = new RepositoryDocument();
+
+ //If using the lastAccess patched/Google version of jcifs then this can be uncommented
+ //Date lastAccessDate = new Date(file.lastAccess());
+ Integer attributes = file.getAttributes();
+ String shareName = file.getShare();
+
+ rd.setFileName(fileNameString);
+ if (contentType != null)
+ rd.setMimeType(contentType);
+ rd.addField("lastModified", lastModifiedDate.toString());
+ rd.setModifiedDate(lastModifiedDate);
+
+ // Add extra obtainable fields to the field map
+ rd.addField("createdOn", creationDate.toString());
+ rd.setCreatedDate(creationDate);
+
+ //rd.addField("lastAccess", lastModifiedDate.toString());
+ rd.addField("attributes", Integer.toString(attributes));
+ rd.addField("shareName", shareName);
+
+ setDocumentSecurity(rd,shareAllow,shareDeny,parentAllow,parentDeny,documentAllow,documentDeny);
+ setPathMetadata(rd,pathAttributeName,pathAttributeValue);
+
+ // manipulate path to include the DFS alias, not the literal path
+ // String newPath = matchPrefix + fileName.substring(matchReplace.length());
+ String newPath = fileName;
+ if (checkNeedFileData(newPath, spec))
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("JCIFS: Local file data needed for '"+documentIdentifier+"'");
+
+ // Create a temporary file, and use that for the check and then the ingest
+ File tempFile = File.createTempFile("_sdc_",null);
+ try
+ {
+ FileOutputStream os = new FileOutputStream(tempFile);
try
{
- FileOutputStream os = new FileOutputStream(tempFile);
+
+ // Now, make a local copy so we can fingerprint
+ InputStream inputStream = getFileInputStream(file);
try
{
-
- // Now, make a local copy so we can fingerprint
- InputStream inputStream = getFileInputStream(file);
- try
+ // Copy!
+ if (transferBuffer == null)
+ transferBuffer = new byte[65536];
+ while (true)
{
- // Copy!
- if (transferBuffer == null)
- transferBuffer = new byte[65536];
- while (true)
- {
- int amt = inputStream.read(transferBuffer,0,transferBuffer.length);
- if (amt == -1)
- break;
- os.write(transferBuffer,0,amt);
- }
- }
- finally
- {
- inputStream.close();
+ int amt = inputStream.read(transferBuffer,0,transferBuffer.length);
+ if (amt == -1)
+ break;
+ os.write(transferBuffer,0,amt);
}
}
finally
{
- os.close();
+ inputStream.close();
}
+ }
+ finally
+ {
+ os.close();
+ }
- if (checkIngest(tempFile, newPath, spec, activities))
+ if (checkIngest(tempFile, newPath, spec, activities))
+ {
+ long fileLength = tempFile.length();
+ if (!activities.checkLengthIndexable(fileLength))
{
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("JCIFS: Decided to ingest '"+documentIdentifier+"'");
- // OK, do ingestion itself!
- InputStream inputStream = new FileInputStream(tempFile);
- try
- {
- rd.setBinary(inputStream, tempFile.length());
-
- activities.ingestDocumentWithException(documentIdentifier, versionString, uri, rd);
- }
- finally
- {
- inputStream.close();
- }
-
- // I put this record here deliberately for two reasons:
- // (1) the other path includes ingestion time, and
- // (2) if anything fails up to and during ingestion, I want THAT failure record to be written, not this one.
- // So, really, ACTIVITY_ACCESS is a bit more than just fetch for JCIFS...
- activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
- new Long(tempFile.length()),documentIdentifier,"Success",null,null);
+ Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept length ("+fileLength+")");
+ activities.recordActivity(null,ACTIVITY_ACCESS,
+ null,documentIdentifier,"Skip","Output connector refused length",null);
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("JCIFS: Decided to ingest '"+documentIdentifier+"'");
+ // OK, do ingestion itself!
+ InputStream inputStream = new FileInputStream(tempFile);
+ try
+ {
+ rd.setBinary(inputStream, fileLength);
+
+ activities.ingestDocumentWithException(documentIdentifier, versionString, uri, rd);
}
- else
+ finally
{
- // We must actively remove the document here, because the getDocumentVersions()
- // method has no way of signalling this, since it does not do the fingerprinting.
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("JCIFS: Decided to remove '"+documentIdentifier+"'");
- activities.noDocument(documentIdentifier, versionString);
- // We should record the access here as well, since this is a non-exception way through the code path.
- // (I noticed that this was not being recorded in the history while fixing 25477.)
- activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
- new Long(tempFile.length()),documentIdentifier,"Success",null,null);
+ inputStream.close();
}
- }
- finally
- {
- tempFile.delete();
- }
- }
- else
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("JCIFS: Local file data not needed for '"+documentIdentifier+"'");
- // Presume that since the file was queued that it fulfilled the needed criteria.
- // Go off and ingest the fast way.
+ // I put this record here deliberately for two reasons:
+ // (1) the other path includes ingestion time, and
+ // (2) if anything fails up to and during ingestion, I want THAT failure record to be written, not this one.
+ // So, really, ACTIVITY_ACCESS is a bit more than just fetch for JCIFS...
+ activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
+ new Long(tempFile.length()),documentIdentifier,"Success",null,null);
- // Ingest the document.
- InputStream inputStream = getFileInputStream(file);
- try
- {
- rd.setBinary(inputStream, fileLength(file));
-
- activities.ingestDocumentWithException(documentIdentifier, versionString, uri, rd);
}
- finally
+ else
{
- inputStream.close();
+ // We must actively remove the document here, because the getDocumentVersions()
+ // method has no way of signalling this, since it does not do the fingerprinting.
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("JCIFS: Decided to remove '"+documentIdentifier+"'");
+ activities.noDocument(documentIdentifier, versionString);
+ // We should record the access here as well, since this is a non-exception way through the code path.
+ // (I noticed that this was not being recorded in the history while fixing 25477.)
+ activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
+ new Long(tempFile.length()),documentIdentifier,"Success",null,null);
}
- activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
- new Long(fileLength(file)),documentIdentifier,"Success",null,null);
+ }
+ finally
+ {
+ tempFile.delete();
}
}
else
{
- Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept it");
- activities.recordActivity(null,ACTIVITY_ACCESS,
- null,documentIdentifier,"Skip","Output connector refused",null);
- activities.noDocument(documentIdentifier,versionString);
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("JCIFS: Local file data not needed for '"+documentIdentifier+"'");
+
+ long fileLength = fileLength(file);
+ if (!activities.checkLengthIndexable(fileLength))
+ {
+ Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept length ("+fileLength+")");
+ activities.recordActivity(null,ACTIVITY_ACCESS,
+ null,documentIdentifier,"Skip","Output connector refused length",null);
+ activities.noDocument(documentIdentifier,versionString);
+ continue;
+ }
+
+ // Presume that since the file was queued that it fulfilled the needed criteria.
+ // Go off and ingest the fast way.
+
+ // Ingest the document.
+ InputStream inputStream = getFileInputStream(file);
+ try
+ {
+ rd.setBinary(inputStream, fileLength);
+
+ activities.ingestDocumentWithException(documentIdentifier, versionString, uri, rd);
+ }
+ finally
+ {
+ inputStream.close();
+ }
+ activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
+ new Long(fileLength(file)),documentIdentifier,"Success",null,null);
}
}
else
@@ -1032,40 +1090,6 @@ public class SharedDriveConnector extend
}
- protected static void prepareForIndexing(RepositoryDocument rd, SmbFile file,
- String[] shareAllow, String[] shareDeny, String[] parentAllow, String[] parentDeny, String[] allow, String[] deny,
- String pathAttributeName, String pathAttributeValue)
- throws ManifoldCFException, SmbException
- {
- String fileNameString = file.getName();
- Date lastModifiedDate = new Date(file.lastModified());
- Date creationDate = new Date(file.createTime());
- //If using the lastAccess patched/Google version of jcifs then this can be uncommented
- //Date lastAccessDate = new Date(file.lastAccess());
- Integer attributes = file.getAttributes();
- String shareName = file.getShare();
-
-
- String contentType = mapExtensionToMimeType(fileNameString);
-
- rd.setFileName(fileNameString);
- if (contentType != null)
- rd.setMimeType(contentType);
- rd.addField("lastModified", lastModifiedDate.toString());
- rd.setModifiedDate(lastModifiedDate);
-
- // Add extra obtainable fields to the field map
- rd.addField("createdOn", creationDate.toString());
- rd.setCreatedDate(creationDate);
-
- //rd.addField("lastAccess", lastModifiedDate.toString());
- rd.addField("attributes", Integer.toString(attributes));
- rd.addField("shareName", shareName);
-
- setDocumentSecurity(rd,shareAllow,shareDeny,parentAllow,parentDeny,allow,deny);
- setPathMetadata(rd,pathAttributeName,pathAttributeValue);
- }
-
/** Map an extension to a mime type */
protected static String mapExtensionToMimeType(String fileName)
{
Modified: manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java?rev=1630188&r1=1630187&r2=1630188&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java (original)
+++ manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java Wed Oct 8 17:54:47 2014
@@ -526,154 +526,167 @@ public class JDBCConnector extends org.a
if (o != null)
contentType = JDBCConnection.readAsString(o);
else
- contentType = null;
-
- if (contentType == null || activities.checkMimeTypeIndexable(contentType))
{
if (contents instanceof BinaryInput)
- {
- // An ingestion will take place for this document.
- RepositoryDocument rd = new RepositoryDocument();
+ contentType = "application/octet-stream";
+ else if (contents instanceof CharacterInput)
+ contentType = "text/plain; charset=utf-8";
+ else
+ contentType = "text/plain";
+ }
+
+ if (!activities.checkMimeTypeIndexable(contentType))
+ {
+ Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of mime type - skipping");
+ activities.noDocument(id,version);
+ continue;
+ }
+
+ if (!activities.checkURLIndexable(url))
+ {
+ Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of url - skipping");
+ activities.noDocument(id,version);
+ continue;
+ }
- // Default content type is application/octet-stream for binary data
- if (contentType == null)
- rd.setMimeType("application/octet-stream");
- else
- rd.setMimeType(contentType);
+ // An ingestion will take place for this document.
+ RepositoryDocument rd = new RepositoryDocument();
+ rd.setMimeType(contentType);
- applyAccessTokens(rd,ts);
- applyMetadata(rd,row);
+ applyAccessTokens(rd,ts);
+ applyMetadata(rd,row);
+
+ if (contents instanceof BinaryInput)
+ {
+
+ BinaryInput bi = (BinaryInput)contents;
+ long fileLength = bi.getLength();
+
+ if (!activities.checkLengthIndexable(fileLength))
+ {
+ Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of length - skipping");
+ activities.noDocument(id, version);
+ continue;
+ }
- BinaryInput bi = (BinaryInput)contents;
+ try
+ {
+ // Read the stream
+ InputStream is = bi.getStream();
try
{
- // Read the stream
- InputStream is = bi.getStream();
- try
- {
- rd.setBinary(is,bi.getLength());
- activities.ingestDocumentWithException(id, version, url, rd);
- }
- finally
- {
- is.close();
- }
+ rd.setBinary(is,fileLength);
+ activities.ingestDocumentWithException(id, version, url, rd);
}
- catch (java.net.SocketTimeoutException e)
+ finally
{
- throw new ManifoldCFException("Socket timeout reading database data: "+e.getMessage(),e);
+ is.close();
}
- catch (InterruptedIOException e)
+ }
+ catch (java.net.SocketTimeoutException e)
+ {
+ throw new ManifoldCFException("Socket timeout reading database data: "+e.getMessage(),e);
+ }
+ catch (InterruptedIOException e)
+ {
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ }
+ catch (IOException e)
+ {
+ throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
+ }
+ }
+ else if (contents instanceof CharacterInput)
+ {
+ CharacterInput ci = (CharacterInput)contents;
+ long fileLength = ci.getUtf8StreamLength();
+
+ if (!activities.checkLengthIndexable(fileLength))
+ {
+ Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of length - skipping");
+ activities.noDocument(id, version);
+ continue;
+ }
+
+ try
+ {
+ // Read the stream
+ InputStream is = ci.getUtf8Stream();
+ try
{
- throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ rd.setBinary(is,fileLength);
+ activities.ingestDocumentWithException(id, version, url, rd);
}
- catch (IOException e)
+ finally
{
- throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
+ is.close();
}
}
- else if (contents instanceof CharacterInput)
+ catch (java.net.SocketTimeoutException e)
+ {
+ throw new ManifoldCFException("Socket timeout reading database data: "+e.getMessage(),e);
+ }
+ catch (InterruptedIOException e)
+ {
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ }
+ catch (IOException e)
{
- // An ingestion will take place for this document.
- RepositoryDocument rd = new RepositoryDocument();
+ throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
+ }
+ }
+ else
+ {
+ // Turn it into a string, and then into a stream
+ String value = contents.toString();
+ byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
+ long fileLength = bytes.length;
- // Default content type is application/octet-stream for binary data
- if (contentType == null)
- rd.setMimeType("text/plain; charset=utf-8");
- else
- rd.setMimeType(contentType);
-
- applyAccessTokens(rd,ts);
- applyMetadata(rd,row);
+ if (!activities.checkLengthIndexable(fileLength))
+ {
+ Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of length - skipping");
+ activities.noDocument(id, version);
+ continue;
+ }
- CharacterInput ci = (CharacterInput)contents;
+ try
+ {
+ InputStream is = new ByteArrayInputStream(bytes);
try
{
- // Read the stream
- InputStream is = ci.getUtf8Stream();
- try
- {
- rd.setBinary(is,ci.getUtf8StreamLength());
- activities.ingestDocumentWithException(id, version, url, rd);
- }
- finally
- {
- is.close();
- }
- }
- catch (java.net.SocketTimeoutException e)
- {
- throw new ManifoldCFException("Socket timeout reading database data: "+e.getMessage(),e);
- }
- catch (InterruptedIOException e)
- {
- throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ rd.setBinary(is,fileLength);
+ activities.ingestDocumentWithException(id, version, url, rd);
}
- catch (IOException e)
+ finally
{
- throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
+ is.close();
}
}
- else
+ catch (InterruptedIOException e)
{
- // Turn it into a string, and then into a stream
- String value = contents.toString();
- try
- {
- byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
- RepositoryDocument rd = new RepositoryDocument();
-
- // Default content type is text/plain for character data
- if (contentType == null)
- rd.setMimeType("text/plain");
- else
- rd.setMimeType(contentType);
-
- applyAccessTokens(rd,ts);
- applyMetadata(rd,row);
-
- InputStream is = new ByteArrayInputStream(bytes);
- try
- {
- rd.setBinary(is,bytes.length);
- activities.ingestDocumentWithException(id, version, url, rd);
- }
- finally
- {
- is.close();
- }
- }
- catch (InterruptedIOException e)
- {
- throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
- }
- catch (IOException e)
- {
- throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
- }
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ }
+ catch (IOException e)
+ {
+ throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
}
- }
- else
- {
- Logging.connectors.warn("JDBC: Document '"+id+"' excluded because of mime type - skipping");
- activities.noDocument(id,version);
}
}
else
{
- Logging.connectors.warn("JDBC: Document '"+id+"' seems to have null data - skipping");
+ Logging.connectors.debug("JDBC: Document '"+id+"' seems to have null data - skipping");
activities.noDocument(id,version);
}
}
else
{
- Logging.connectors.warn("JDBC: Document '"+id+"' has an illegal url: '"+url+"' - skipping");
+ Logging.connectors.debug("JDBC: Document '"+id+"' has an illegal url: '"+url+"' - skipping");
activities.noDocument(id,version);
}
}
else
{
- Logging.connectors.warn("JDBC: Document '"+id+"' has a null url - skipping");
+ Logging.connectors.debug("JDBC: Document '"+id+"' has a null url - skipping");
activities.noDocument(id,version);
}
}
Modified: manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java?rev=1630188&r1=1630187&r2=1630188&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java Wed Oct 8 17:54:47 2014
@@ -990,6 +990,31 @@ public class JiraRepositoryConnector ext
+ documentIdentifier + "'");
}
+ // Now do standard stuff
+
+ String mimeType = "text/plain";
+ Date createdDate = jiraFile.getCreatedDate();
+ Date modifiedDate = jiraFile.getUpdatedDate();
+ String documentURI = composeDocumentURI(getBaseUrl(session), jiraFile.getKey());
+
+ if (!activities.checkURLIndexable(documentURI))
+ {
+ activities.noDocument(documentIdentifier, versionString);
+ continue;
+ }
+
+ if (!activities.checkMimeTypeIndexable(mimeType))
+ {
+ activities.noDocument(documentIdentifier, versionString);
+ continue;
+ }
+
+ if (!activities.checkDateIndexable(modifiedDate))
+ {
+ activities.noDocument(documentIdentifier, versionString);
+ continue;
+ }
+
//otherwise process
RepositoryDocument rd = new RepositoryDocument();
@@ -1001,12 +1026,6 @@ public class JiraRepositoryConnector ext
denyAclsToUse = new String[0];
rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,aclsToUse,denyAclsToUse);
- // Now do standard stuff
-
- String mimeType = "text/plain";
- Date createdDate = jiraFile.getCreatedDate();
- Date modifiedDate = jiraFile.getUpdatedDate();
-
rd.setMimeType(mimeType);
if (createdDate != null)
rd.setCreatedDate(createdDate);
@@ -1024,13 +1043,20 @@ public class JiraRepositoryConnector ext
rd.addField(entry.getKey(), entry.getValue());
}
- String documentURI = composeDocumentURI(getBaseUrl(session), jiraFile.getKey());
String document = getJiraBody(jiraFile);
try {
byte[] documentBytes = document.getBytes(StandardCharsets.UTF_8);
+ long fileLength = documentBytes.length;
+
+ if (!activities.checkLengthIndexable(fileLength))
+ {
+ activities.noDocument(documentIdentifier, versionString);
+ continue;
+ }
+
InputStream is = new ByteArrayInputStream(documentBytes);
try {
- rd.setBinary(is, documentBytes.length);
+ rd.setBinary(is, fileLength);
activities.ingestDocumentWithException(documentIdentifier, versionString, documentURI, rd);
// No errors. Record the fact that we made it.
errorCode = "OK";