You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/10/08 15:32:11 UTC

svn commit: r1630099 - /manifoldcf/branches/CONNECTORS-1067/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java

Author: kwright
Date: Wed Oct  8 13:32:11 2014
New Revision: 1630099

URL: http://svn.apache.org/r1630099
Log:
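Update JDBC connector: choose the default content type up front, check mime type, URL, and length indexability before ingesting, build the RepositoryDocument once for all content kinds, and log skipped documents at debug level instead of warn.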

Modified:
    manifoldcf/branches/CONNECTORS-1067/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java

Modified: manifoldcf/branches/CONNECTORS-1067/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1067/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java?rev=1630099&r1=1630098&r2=1630099&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1067/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-1067/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java Wed Oct  8 13:32:11 2014
@@ -526,154 +526,167 @@ public class JDBCConnector extends org.a
                   if (o != null)
                     contentType = JDBCConnection.readAsString(o);
                   else
-                    contentType = null;
-                  
-                  if (contentType == null || activities.checkMimeTypeIndexable(contentType))
                   {
                     if (contents instanceof BinaryInput)
-                    {
-                      // An ingestion will take place for this document.
-                      RepositoryDocument rd = new RepositoryDocument();
+                      contentType = "application/octet-stream";
+                    else if (contents instanceof CharacterInput)
+                      contentType = "text/plain; charset=utf-8";
+                    else
+                      contentType = "text/plain";
+                  }
+                  
+                  if (!activities.checkMimeTypeIndexable(contentType))
+                  {
+                    Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of mime type - skipping");
+                    activities.noDocument(id,version);
+                    continue;
+                  }
+                  
+                  if (!activities.checkURLIndexable(url))
+                  {
+                    Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of url - skipping");
+                    activities.noDocument(id,version);
+                    continue;
+                  }
 
-                      // Default content type is application/octet-stream for binary data
-                      if (contentType == null)
-                        rd.setMimeType("application/octet-stream");
-                      else
-                        rd.setMimeType(contentType);
+                  // An ingestion will take place for this document.
+                  RepositoryDocument rd = new RepositoryDocument();
+                  rd.setMimeType(contentType);
                       
-                      applyAccessTokens(rd,ts);
-                      applyMetadata(rd,row);
+                  applyAccessTokens(rd,ts);
+                  applyMetadata(rd,row);
+
+                  if (contents instanceof BinaryInput)
+                  {
+
+                    BinaryInput bi = (BinaryInput)contents;
+                    long fileLength = bi.getLength();
+                    
+                    if (!activities.checkLengthIndexable(fileLength))
+                    {
+                      Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of length - skipping");
+                      activities.noDocument(id, version);
+                      continue;
+                    }
 
-                      BinaryInput bi = (BinaryInput)contents;
+                    try
+                    {
+                      // Read the stream
+                      InputStream is = bi.getStream();
                       try
                       {
-                        // Read the stream
-                        InputStream is = bi.getStream();
-                        try
-                        {
-                          rd.setBinary(is,bi.getLength());
-                          activities.ingestDocumentWithException(id, version, url, rd);
-                        }
-                        finally
-                        {
-                          is.close();
-                        }
+                        rd.setBinary(is,fileLength);
+                        activities.ingestDocumentWithException(id, version, url, rd);
                       }
-                      catch (java.net.SocketTimeoutException e)
+                      finally
                       {
-                        throw new ManifoldCFException("Socket timeout reading database data: "+e.getMessage(),e);
+                        is.close();
                       }
-                      catch (InterruptedIOException e)
+                    }
+                    catch (java.net.SocketTimeoutException e)
+                    {
+                      throw new ManifoldCFException("Socket timeout reading database data: "+e.getMessage(),e);
+                    }
+                    catch (InterruptedIOException e)
+                    {
+                      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                    }
+                    catch (IOException e)
+                    {
+                      throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
+                    }
+                  }
+                  else if (contents instanceof CharacterInput)
+                  {
+                    CharacterInput ci = (CharacterInput)contents;
+                    long fileLength = ci.getUtf8StreamLength();
+                    
+                    if (!activities.checkLengthIndexable(fileLength))
+                    {
+                      Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of length - skipping");
+                      activities.noDocument(id, version);
+                      continue;
+                    }
+                    
+                    try
+                    {
+                      // Read the stream
+                      InputStream is = ci.getUtf8Stream();
+                      try
                       {
-                        throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                        rd.setBinary(is,fileLength);
+                        activities.ingestDocumentWithException(id, version, url, rd);
                       }
-                      catch (IOException e)
+                      finally
                       {
-                        throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
+                        is.close();
                       }
                     }
-                    else if (contents instanceof CharacterInput)
+                    catch (java.net.SocketTimeoutException e)
+                    {
+                      throw new ManifoldCFException("Socket timeout reading database data: "+e.getMessage(),e);
+                    }
+                    catch (InterruptedIOException e)
+                    {
+                      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                    }
+                    catch (IOException e)
                     {
-                      // An ingestion will take place for this document.
-                      RepositoryDocument rd = new RepositoryDocument();
+                      throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
+                    }
+                  }
+                  else
+                  {
+                    // Turn it into a string, and then into a stream
+                    String value = contents.toString();
+                    byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
+                    long fileLength = bytes.length;
 
-                      // Default content type is application/octet-stream for binary data
-                      if (contentType == null)
-                        rd.setMimeType("text/plain; charset=utf-8");
-                      else
-                        rd.setMimeType(contentType);
-                      
-                      applyAccessTokens(rd,ts);
-                      applyMetadata(rd,row);
+                    if (!activities.checkLengthIndexable(fileLength))
+                    {
+                      Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of length - skipping");
+                      activities.noDocument(id, version);
+                      continue;
+                    }
 
-                      CharacterInput ci = (CharacterInput)contents;
+                    try
+                    {
+                      InputStream is = new ByteArrayInputStream(bytes);
                       try
                       {
-                        // Read the stream
-                        InputStream is = ci.getUtf8Stream();
-                        try
-                        {
-                          rd.setBinary(is,ci.getUtf8StreamLength());
-                          activities.ingestDocumentWithException(id, version, url, rd);
-                        }
-                        finally
-                        {
-                          is.close();
-                        }
-                      }
-                      catch (java.net.SocketTimeoutException e)
-                      {
-                        throw new ManifoldCFException("Socket timeout reading database data: "+e.getMessage(),e);
-                      }
-                      catch (InterruptedIOException e)
-                      {
-                        throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                        rd.setBinary(is,fileLength);
+                        activities.ingestDocumentWithException(id, version, url, rd);
                       }
-                      catch (IOException e)
+                      finally
                       {
-                        throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
+                        is.close();
                       }
                     }
-                    else
+                    catch (InterruptedIOException e)
                     {
-                      // Turn it into a string, and then into a stream
-                      String value = contents.toString();
-                      try
-                      {
-                        byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
-                        RepositoryDocument rd = new RepositoryDocument();
-
-                        // Default content type is text/plain for character data
-                        if (contentType == null)
-                          rd.setMimeType("text/plain");
-                        else
-                          rd.setMimeType(contentType);
-                        
-                        applyAccessTokens(rd,ts);
-                        applyMetadata(rd,row);
-
-                        InputStream is = new ByteArrayInputStream(bytes);
-                        try
-                        {
-                          rd.setBinary(is,bytes.length);
-                          activities.ingestDocumentWithException(id, version, url, rd);
-                        }
-                        finally
-                        {
-                          is.close();
-                        }
-                      }
-                      catch (InterruptedIOException e)
-                      {
-                        throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
-                      }
-                      catch (IOException e)
-                      {
-                        throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
-                      }
+                      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                    }
+                    catch (IOException e)
+                    {
+                      throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
                     }
-                  }
-                  else
-                  {
-                    Logging.connectors.warn("JDBC: Document '"+id+"' excluded because of mime type - skipping");
-                    activities.noDocument(id,version);
                   }
                 }
                 else
                 {
-                  Logging.connectors.warn("JDBC: Document '"+id+"' seems to have null data - skipping");
+                  Logging.connectors.debug("JDBC: Document '"+id+"' seems to have null data - skipping");
                   activities.noDocument(id,version);
                 }
               }
               else
               {
-                Logging.connectors.warn("JDBC: Document '"+id+"' has an illegal url: '"+url+"' - skipping");
+                Logging.connectors.debug("JDBC: Document '"+id+"' has an illegal url: '"+url+"' - skipping");
                 activities.noDocument(id,version);
               }
             }
             else
             {
-              Logging.connectors.warn("JDBC: Document '"+id+"' has a null url - skipping");
+              Logging.connectors.debug("JDBC: Document '"+id+"' has a null url - skipping");
               activities.noDocument(id,version);
             }
           }