You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2012/12/18 22:34:10 UTC
svn commit: r1423675 - in /manifoldcf/trunk: CHANGES.txt
connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
Author: kwright
Date: Tue Dec 18 21:34:09 2012
New Revision: 1423675
URL: http://svn.apache.org/viewvc?rev=1423675&view=rev
Log:
Fix for CONNECTORS-589. Parse multiple atom link tags per entry.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1423675&r1=1423674&r2=1423675&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Dec 18 21:34:09 2012
@@ -3,6 +3,10 @@ $Id$
======================= 1.1-dev =====================
+CONNECTORS-589: Parse multiple link tags inside an entry tag, for
+Atom feeds.
+(David Morana, Karl Wright)
+
CONNECTORS-543: Add testing infrastructure for combined war.
(Karl Wright)
Modified: manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1423675&r1=1423674&r2=1423675&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java (original)
+++ manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java Tue Dec 18 21:34:09 2012
@@ -4500,7 +4500,7 @@ public class RSSConnector extends org.ap
protected class FeedItemContextClass extends XMLContext
{
protected int dechromedContentMode;
- protected String linkField = null;
+ protected List<String> linkField = new ArrayList<String>();
protected String pubDateField = null;
protected String titleField = null;
protected ArrayList categoryField = new ArrayList();
@@ -4520,7 +4520,9 @@ public class RSSConnector extends org.ap
if (qName.equals("link"))
{
// "link" tag
- linkField = atts.getValue("href");
+ String ref = atts.getValue("href");
+ if (ref != null && ref.length() > 0)
+ linkField.add(ref);
return super.beginTag(namespaceURI,localName,qName,atts);
}
else if (qName.equals("published") || qName.equals("updated"))
@@ -4675,101 +4677,104 @@ public class RSSConnector extends org.ap
public void process(String documentIdentifier, IProcessActivity activities, Filter filter)
throws ManifoldCFException
{
- if (linkField != null && linkField.length() > 0)
+ if (linkField.size() > 0)
{
Long origDate = null;
if (pubDateField != null && pubDateField.length() > 0)
origDate = parseZuluDate(pubDateField);
- String[] links = linkField.split(", ");
- int l = 0;
- while (l < links.length)
+ for (String linkValue : linkField)
{
- String rawURL = links[l++].trim();
- // Process the link
- String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(),documentIdentifier,rawURL);
- if (newIdentifier != null)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"', found a link to '"+newIdentifier+"', which has origination date "+
- ((origDate==null)?"null":origDate.toString()));
- if (filter.isLegalURL(newIdentifier))
+ String[] links = linkValue.split(", ");
+ int l = 0;
+ while (l < links.length)
+ {
+ String rawURL = links[l++].trim();
+ // Process the link
+ String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(),documentIdentifier,rawURL);
+ if (newIdentifier != null)
{
- if (contentsFile == null)
- {
- // It's a reference! Add it.
- String[] dataNames = new String[]{"pubdate","title","source","category","description"};
- String[][] dataValues = new String[dataNames.length][];
- if (origDate != null)
- dataValues[0] = new String[]{origDate.toString()};
- if (titleField != null)
- dataValues[1] = new String[]{titleField};
- dataValues[2] = new String[]{documentIdentifier};
- dataValues[3] = new String[categoryField.size()];
- int q = 0;
- while (q < categoryField.size())
- {
- (dataValues[3])[q] = (String)categoryField.get(q);
- q++;
- }
- if (descriptionField != null)
- dataValues[4] = new String[]{descriptionField};
-
- // Add document reference, including the data to pass down
- activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
- }
- else
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"', found a link to '"+newIdentifier+"', which has origination date "+
+ ((origDate==null)?"null":origDate.toString()));
+ if (filter.isLegalURL(newIdentifier))
{
- // The issue here is that if a document is ingested without a jobqueue entry, the document will not
- // be cleaned up if the job is deleted; nor is there any expiration possibility. So, we really do need to make
- // sure a jobqueue entry gets created somehow. Therefore I can't just ingest the document
- // right here.
-
- // Now, set up the carrydown info
- String[] dataNames = new String[]{"pubdate","title","source","category","data","description"};
- Object[][] dataValues = new Object[dataNames.length][];
- if (origDate != null)
- dataValues[0] = new String[]{origDate.toString()};
- if (titleField != null)
- dataValues[1] = new String[]{titleField};
- dataValues[2] = new String[]{documentIdentifier};
- dataValues[3] = new String[categoryField.size()];
- int q = 0;
- while (q < categoryField.size())
- {
- (dataValues[3])[q] = (String)categoryField.get(q);
- q++;
- }
- if (descriptionField != null)
- dataValues[5] = new String[]{descriptionField};
-
- CharacterInput ci = new TempFileCharacterInput(contentsFile);
- try
+ if (contentsFile == null)
{
- contentsFile = null;
-
- dataValues[4] = new Object[]{ci};
-
- // Add document reference, including the data to pass down, and the dechromed content too
+ // It's a reference! Add it.
+ String[] dataNames = new String[]{"pubdate","title","source","category","description"};
+ String[][] dataValues = new String[dataNames.length][];
+ if (origDate != null)
+ dataValues[0] = new String[]{origDate.toString()};
+ if (titleField != null)
+ dataValues[1] = new String[]{titleField};
+ dataValues[2] = new String[]{documentIdentifier};
+ dataValues[3] = new String[categoryField.size()];
+ int q = 0;
+ while (q < categoryField.size())
+ {
+ (dataValues[3])[q] = (String)categoryField.get(q);
+ q++;
+ }
+ if (descriptionField != null)
+ dataValues[4] = new String[]{descriptionField};
+
+ // Add document reference, including the data to pass down
activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
}
- finally
+ else
{
- ci.discard();
+ // The issue here is that if a document is ingested without a jobqueue entry, the document will not
+ // be cleaned up if the job is deleted; nor is there any expiration possibility. So, we really do need to make
+ // sure a jobqueue entry gets created somehow. Therefore I can't just ingest the document
+ // right here.
+
+ // Now, set up the carrydown info
+ String[] dataNames = new String[]{"pubdate","title","source","category","data","description"};
+ Object[][] dataValues = new Object[dataNames.length][];
+ if (origDate != null)
+ dataValues[0] = new String[]{origDate.toString()};
+ if (titleField != null)
+ dataValues[1] = new String[]{titleField};
+ dataValues[2] = new String[]{documentIdentifier};
+ dataValues[3] = new String[categoryField.size()];
+ int q = 0;
+ while (q < categoryField.size())
+ {
+ (dataValues[3])[q] = (String)categoryField.get(q);
+ q++;
+ }
+ if (descriptionField != null)
+ dataValues[5] = new String[]{descriptionField};
+
+ CharacterInput ci = new TempFileCharacterInput(contentsFile);
+ try
+ {
+ contentsFile = null;
+
+ dataValues[4] = new Object[]{ci};
+
+ // Add document reference, including the data to pass down, and the dechromed content too
+ activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
+ }
+ finally
+ {
+ ci.discard();
+ }
}
}
+ else
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: Identifier '"+newIdentifier+"' is excluded");
+ }
}
else
{
if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: Identifier '"+newIdentifier+"' is excluded");
+ Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"', found an unincluded URL '"+rawURL+"'");
}
}
- else
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: In Atom document '"+documentIdentifier+"', found an unincluded URL '"+rawURL+"'");
- }
}
}
}