You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2012/12/18 22:34:10 UTC
svn commit: r1423675 - in /manifoldcf/trunk: CHANGES.txt
connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
Author: kwright
Date: Tue Dec 18 21:34:09 2012
New Revision: 1423675
URL: http://svn.apache.org/viewvc?rev=1423675&view=rev
Log:
Fix for CONNECTORS-589. Parse multiple atom link tags per entry.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1423675&r1=1423674&r2=1423675&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Dec 18 21:34:09 2012
@@ -3,6 +3,10 @@ $Id$
======================= 1.1-dev =====================
+CONNECTORS-589: Parse multiple link tags inside an entry tag, for
+Atom feeds.
+(David Morana, Karl Wright)
+
CONNECTORS-543: Add testing infrastructure for combined war.
(Karl Wright)
Modified: manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1423675&r1=1423674&r2=1423675&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java (original)
+++ manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java Tue Dec 18 21:34:09 2012
@@ -4500,7 +4500,7 @@ public class RSSConnector extends org.ap
protected class FeedItemContextClass extends XMLContext
{
protected int dechromedContentMode;
- protected String linkField = null;
+ protected List<String> linkField = new ArrayList<String>();
protected String pubDateField = null;
protected String titleField = null;
protected ArrayList categoryField = new ArrayList();
@@ -4520,7 +4520,9 @@ public class RSSConnector extends org.ap
if (qName.equals("link"))
{
// "link" tag
- linkField = atts.getValue("href");
+ String ref = atts.getValue("href");
+ if (ref != null && ref.length() > 0)
+ linkField.add(ref);
return super.beginTag(namespaceURI,localName,qName,atts);
}
else if (qName.equals("published") || qName.equals("updated"))
@@ -4675,101 +4677,104 @@ public class RSSConnector extends org.ap
public void process(String documentIdentifier, IProcessActivity activities, Filter filter)
throws ManifoldCFException
{
- if (linkField != null && linkField.length() > 0)
+ if (linkField.size() > 0)
{
Long origDate = null;
if (pubDateField != null && pubDateField.length() > 0)
origDate = parseZuluDate(pubDateField);
- String[] links = linkField.split(", ");
- int l = 0;
- while (l < links.length)
+ for (String linkValue : linkField)
{
- String rawURL = links[l++].trim();
- // Process the link
- String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(),documentIdentifier,rawURL);
- if (newIdentifier != null)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"', found a link to '"+newIdentifier+"', which has origination date "+
- ((origDate==null)?"null":origDate.toString()));
- if (filter.isLegalURL(newIdentifier))
+ String[] links = linkValue.split(", ");
+ int l = 0;
+ while (l < links.length)
+ {
+ String rawURL = links[l++].trim();
+ // Process the link
+ String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(),documentIdentifier,rawURL);
+ if (newIdentifier != null)
{
- if (contentsFile == null)
- {
- // It's a reference! Add it.
- String[] dataNames = new String[]{"pubdate","title","source","category","description"};
- String[][] dataValues = new String[dataNames.length][];
- if (origDate != null)
- dataValues[0] = new String[]{origDate.toString()};
- if (titleField != null)
- dataValues[1] = new String[]{titleField};
- dataValues[2] = new String[]{documentIdentifier};
- dataValues[3] = new String[categoryField.size()];
- int q = 0;
- while (q < categoryField.size())
- {
- (dataValues[3])[q] = (String)categoryField.get(q);
- q++;
- }
- if (descriptionField != null)
- dataValues[4] = new String[]{descriptionField};
-
- // Add document reference, including the data to pass down
- activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
- }
- else
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"', found a link to '"+newIdentifier+"', which has origination date "+
+ ((origDate==null)?"null":origDate.toString()));
+ if (filter.isLegalURL(newIdentifier))
{
- // The issue here is that if a document is ingested without a jobqueue entry, the document will not
- // be cleaned up if the job is deleted; nor is there any expiration possibility. So, we really do need to make
- // sure a jobqueue entry gets created somehow. Therefore I can't just ingest the document
- // right here.
-
- // Now, set up the carrydown info
- String[] dataNames = new String[]{"pubdate","title","source","category","data","description"};
- Object[][] dataValues = new Object[dataNames.length][];
- if (origDate != null)
- dataValues[0] = new String[]{origDate.toString()};
- if (titleField != null)
- dataValues[1] = new String[]{titleField};
- dataValues[2] = new String[]{documentIdentifier};
- dataValues[3] = new String[categoryField.size()];
- int q = 0;
- while (q < categoryField.size())
- {
- (dataValues[3])[q] = (String)categoryField.get(q);
- q++;
- }
- if (descriptionField != null)
- dataValues[5] = new String[]{descriptionField};
-
- CharacterInput ci = new TempFileCharacterInput(contentsFile);
- try
+ if (contentsFile == null)
{
- contentsFile = null;
-
- dataValues[4] = new Object[]{ci};
-
- // Add document reference, including the data to pass down, and the dechromed content too
+ // It's a reference! Add it.
+ String[] dataNames = new String[]{"pubdate","title","source","category","description"};
+ String[][] dataValues = new String[dataNames.length][];
+ if (origDate != null)
+ dataValues[0] = new String[]{origDate.toString()};
+ if (titleField != null)
+ dataValues[1] = new String[]{titleField};
+ dataValues[2] = new String[]{documentIdentifier};
+ dataValues[3] = new String[categoryField.size()];
+ int q = 0;
+ while (q < categoryField.size())
+ {
+ (dataValues[3])[q] = (String)categoryField.get(q);
+ q++;
+ }
+ if (descriptionField != null)
+ dataValues[4] = new String[]{descriptionField};
+
+ // Add document reference, including the data to pass down
activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
}
- finally
+ else
{
- ci.discard();
+ // The issue here is that if a document is ingested without a jobqueue entry, the document will not
+ // be cleaned up if the job is deleted; nor is there any expiration possibility. So, we really do need to make
+ // sure a jobqueue entry gets created somehow. Therefore I can't just ingest the document
+ // right here.
+
+ // Now, set up the carrydown info
+ String[] dataNames = new String[]{"pubdate","title","source","category","data","description"};
+ Object[][] dataValues = new Object[dataNames.length][];
+ if (origDate != null)
+ dataValues[0] = new String[]{origDate.toString()};
+ if (titleField != null)
+ dataValues[1] = new String[]{titleField};
+ dataValues[2] = new String[]{documentIdentifier};
+ dataValues[3] = new String[categoryField.size()];
+ int q = 0;
+ while (q < categoryField.size())
+ {
+ (dataValues[3])[q] = (String)categoryField.get(q);
+ q++;
+ }
+ if (descriptionField != null)
+ dataValues[5] = new String[]{descriptionField};
+
+ CharacterInput ci = new TempFileCharacterInput(contentsFile);
+ try
+ {
+ contentsFile = null;
+
+ dataValues[4] = new Object[]{ci};
+
+ // Add document reference, including the data to pass down, and the dechromed content too
+ activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
+ }
+ finally
+ {
+ ci.discard();
+ }
}
}
+ else
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: Identifier '"+newIdentifier+"' is excluded");
+ }
}
else
{
if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: Identifier '"+newIdentifier+"' is excluded");
+ Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"', found an unincluded URL '"+rawURL+"'");
}
}
- else
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"', found an unincluded URL '"+rawURL+"'");
- }
}
}
}