You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by mo...@apache.org on 2014/10/30 14:20:41 UTC

svn commit: r1635490 - in /manifoldcf/trunk/connectors: googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/ gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/ hdfs/connector/src/main/ja...

Author: molgun
Date: Thu Oct 30 13:20:40 2014
New Revision: 1635490

URL: http://svn.apache.org/r1635490
Log:
CONNECTORS-1077: changes for HDFS, and some modifications to GridFS and GoogleDrive

Modified:
    manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java
    manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java
    manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java

Modified: manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java?rev=1635490&r1=1635489&r2=1635490&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java Thu Oct 30 13:20:40 2014
@@ -1202,9 +1202,7 @@ public class GoogleDriveRepositoryConnec
 
                 // No errors.  Record the fact that we made it.
                 fileSize = new Long(fileLength);
-                if (doLog){
-                    activities.recordActivity(new Long(startTime), ACTIVITY_READ, fileSize, nodeId, "OK", null, null);
-                }
+                errorCode = "OK";
               } catch (InterruptedException e) {
                 t.interrupt();
                 throw new ManifoldCFException("Interrupted: " + e.getMessage(), e,

Modified: manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java?rev=1635490&r1=1635489&r2=1635490&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java Thu Oct 30 13:20:40 2014
@@ -535,8 +535,7 @@ public class GridFSRepositoryConnector e
                             }
                             gfs.getDB().getMongo().getConnector().close();
                             session = null;
-                            activities.recordActivity(startTime, ACTIVITY_FETCH,
-                                    fileLenght, _id, "OK", null, null);
+                            errorCode = "OK";
                         } else {
                             Logging.connectors.warn("GridFS: Document " + _id + " has a invalid URL: " + urlValue + " - skipping.");
                             errorCode = activities.BAD_URL;

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java?rev=1635490&r1=1635489&r2=1635490&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java Thu Oct 30 13:20:40 2014
@@ -378,6 +378,11 @@ public class HDFSRepositoryConnector ext
             sb.append("-");
           sb.append(new Long(lastModified).toString());
           versionString = sb.toString();
+          // We will record document fetch as an activity
+          long startTime = System.currentTimeMillis();
+          String errorCode = null;
+          String errorDesc = null;
+          long fileSize = 0;
           
           if (activities.checkDocumentNeedsReindexing(documentIdentifier,versionString)) {
             // Process file!
@@ -391,106 +396,112 @@ public class HDFSRepositoryConnector ext
             String fileName = fileStatus.getPath().getName();
             String mimeType = mapExtensionToMimeType(fileStatus.getPath().getName());
             Date modifiedDate = new Date(fileStatus.getModificationTime());
-            String uri;
-            if (convertPath != null) {
-              uri = convertToWGETURI(convertPath);
-            } else {
-              uri = fileStatus.getPath().toUri().toString();
-            }
+            try {
+                String uri;
+                if (convertPath != null) {
+                    uri = convertToWGETURI(convertPath);
+                } else {
+                    uri = fileStatus.getPath().toUri().toString();
+                }
             
-            if (!activities.checkLengthIndexable(fileLength))
-            {
-              activities.noDocument(documentIdentifier,versionString);
-              continue;
-            }
+                if (!activities.checkLengthIndexable(fileLength))
+                {
+                    errorCode = activities.EXCLUDED_LENGTH;
+                    errorDesc = "Excluding document because of file length ('"+fileLength+"')";
+                    activities.noDocument(documentIdentifier,versionString);
+                    continue;
+                }
             
-            if (!activities.checkURLIndexable(uri))
-            {
-              activities.noDocument(documentIdentifier,versionString);
-              continue;
-            }
+                if (!activities.checkURLIndexable(uri))
+                {
+                    errorCode = activities.EXCLUDED_URL;
+                    errorDesc = "Excluding document because of URL ('"+uri+"')";
+                    activities.noDocument(documentIdentifier,versionString);
+                    continue;
+                }
             
-            if (!activities.checkMimeTypeIndexable(mimeType))
-            {
-              activities.noDocument(documentIdentifier,versionString);
-              continue;
-            }
+                if (!activities.checkMimeTypeIndexable(mimeType))
+                {
+                    errorCode = activities.EXCLUDED_MIMETYPE;
+                    errorDesc = "Excluding document because of mime type ("+mimeType+")";
+                    activities.noDocument(documentIdentifier,versionString);
+                    continue;
+                }
             
-            if (!activities.checkDateIndexable(modifiedDate))
-            {
-              activities.noDocument(documentIdentifier,versionString);
-              continue;
-            }
+                if (!activities.checkDateIndexable(modifiedDate))
+                {
+                    errorCode = activities.EXCLUDED_DATE;
+                    errorDesc = "Excluding document because of date ("+modifiedDate+")";
+                    activities.noDocument(documentIdentifier,versionString);
+                    continue;
+                }
             
-            // Prepare the metadata part of RepositoryDocument
-            RepositoryDocument data = new RepositoryDocument();
+                // Prepare the metadata part of RepositoryDocument
+                RepositoryDocument data = new RepositoryDocument();
 
-            data.setFileName(fileName);
-            data.setMimeType(mimeType);
-            data.setModifiedDate(modifiedDate);
-
-            data.addField("uri",uri);
-
-            // We will record document fetch as an activity
-            long startTime = System.currentTimeMillis();
-            String errorCode = "FAILED";
-            String errorDesc = StringUtils.EMPTY;
-            long fileSize = 0;
+                data.setFileName(fileName);
+                data.setMimeType(mimeType);
+                data.setModifiedDate(modifiedDate);
 
-            try {
-              BackgroundStreamThread t = new BackgroundStreamThread(getSession(),new Path(documentIdentifier));
-              try {
-                t.start();
-                boolean wasInterrupted = false;
+                data.addField("uri",uri);
+
+            
+            
+                BackgroundStreamThread t = new BackgroundStreamThread(getSession(),new Path(documentIdentifier));
                 try {
-                  InputStream is = t.getSafeInputStream();
-                  try {
-                    data.setBinary(is, fileSize);
-                    activities.ingestDocumentWithException(documentIdentifier,versionString,uri,data);
-                  } finally {
-                    is.close();
-                  }
+                    t.start();
+                    boolean wasInterrupted = false;
+                    try {
+                        InputStream is = t.getSafeInputStream();
+                        try {
+                            data.setBinary(is, fileSize);
+                            activities.ingestDocumentWithException(documentIdentifier,versionString,uri,data);
+                        } finally {
+                            is.close();
+                        }
+                    } catch (java.net.SocketTimeoutException e) {
+                        throw e;
+                    } catch (InterruptedIOException e) {
+                        wasInterrupted = true;
+                        throw e;
+                    } catch (ManifoldCFException e) {
+                        if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
+                            wasInterrupted = true;
+                        }
+                        throw e;
+                    } finally {
+                        if (!wasInterrupted) {
+                            // This does a join
+                            t.finishUp();
+                        }
+                    }
+
+                    // No errors.  Record the fact that we made it.
+                    errorCode = "OK";
+                    // Length we did in bytes
+                    fileSize = fileStatus.getLen();
+
+                } catch (InterruptedException e) {
+                    // We were interrupted out of the join, most likely.  Before we abandon the thread,
+                    // send a courtesy interrupt.
+                    t.interrupt();
+                    throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
                 } catch (java.net.SocketTimeoutException e) {
-                  throw e;
+                    errorCode = "IOERROR";
+                    errorDesc = e.getMessage();
+                    handleIOException(e);
                 } catch (InterruptedIOException e) {
-                  wasInterrupted = true;
-                  throw e;
-                } catch (ManifoldCFException e) {
-                  if (e.getErrorCode() == ManifoldCFException.INTERRUPTED) {
-                    wasInterrupted = true;
-                  }
-                  throw e;
-                } finally {
-                  if (!wasInterrupted) {
-                    // This does a join
-                    t.finishUp();
-                  }
+                    t.interrupt();
+                    throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
+                } catch (IOException e) {
+                    errorCode = "IOERROR";
+                    errorDesc = e.getMessage();
+                    handleIOException(e);
                 }
-
-                // No errors.  Record the fact that we made it.
-                errorCode = "OK";
-                // Length we did in bytes
-                fileSize = fileStatus.getLen();
-
-              } catch (InterruptedException e) {
-                // We were interrupted out of the join, most likely.  Before we abandon the thread,
-                // send a courtesy interrupt.
-                t.interrupt();
-                throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
-              } catch (java.net.SocketTimeoutException e) {
-                errorCode = "IO ERROR";
-                errorDesc = e.getMessage();
-                handleIOException(e);
-              } catch (InterruptedIOException e) {
-                t.interrupt();
-                throw new ManifoldCFException("Interrupted: " + e.getMessage(), e, ManifoldCFException.INTERRUPTED);
-              } catch (IOException e) {
-                errorCode = "IO ERROR";
-                errorDesc = e.getMessage();
-                handleIOException(e);
-              }
             } finally {
-              activities.recordActivity(new Long(startTime),ACTIVITY_READ,new Long(fileSize),documentIdentifier,errorCode,errorDesc,null);
+                if(errorCode != null){
+                    activities.recordActivity(new Long(startTime),ACTIVITY_READ,new Long(fileSize),documentIdentifier,errorCode,errorDesc,null);
+                }
             }
           }
         }