You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ma...@apache.org on 2012/02/09 10:55:09 UTC
svn commit: r1242255 - in /nutch/trunk: ./
src/plugin/subcollection/src/java/org/apache/nutch/collection/
src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/
Author: markus
Date: Thu Feb 9 09:55:08 2012
New Revision: 1242255
URL: http://svn.apache.org/viewvc?rev=1242255&view=rev
Log:
NUTCH-1266 Subcollection to optionally write to configured fields
Modified:
nutch/trunk/CHANGES.txt
nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1242255&r1=1242254&r2=1242255&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Feb 9 09:55:08 2012
@@ -1,5 +1,7 @@
Nutch Change Log
+* NUTCH-1266 Subcollection to optionally write to configured fields (markus)
+
* NUTCH-1005 Parse headings plugin (markus)
* NUTCH-1264 Configurable indexing plugin index-metadata (jnioche)
Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=1242255&r1=1242254&r2=1242255&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java Thu Feb 9 09:55:08 2012
@@ -165,25 +165,25 @@ public class CollectionManager extends C
/**
* Return names of collections url is part of
- *
+ *
* @param url
* The url to test against Collections
- * @return Space delimited string of collection names url is part of
+ * @return List of the Subcollections the url is part of
*/
- public List<String> getSubCollections(final String url) {
- List<String> collections = new ArrayList<String>();
+ public List<Subcollection> getSubCollections(final String url) {
+ List<Subcollection> collections = new ArrayList<Subcollection>();
final Iterator iterator = collectionMap.values().iterator();
while (iterator.hasNext()) {
final Subcollection subCol = (Subcollection) iterator.next();
if (subCol.filter(url) != null) {
- collections.add(subCol.name);
+ collections.add(subCol);
}
}
- if (LOG.isTraceEnabled()) {
- LOG.trace("subcollections:" + Arrays.toString(collections.toArray()));
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("subcollections:" + Arrays.toString(collections.toArray()));
}
-
+
return collections;
}
Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java?rev=1242255&r1=1242254&r2=1242255&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Thu Feb 9 09:55:08 2012
@@ -38,6 +38,7 @@ public class Subcollection extends Confi
public static final String TAG_WHITELIST="whitelist";
public static final String TAG_BLACKLIST="blacklist";
public static final String TAG_NAME="name";
+ public static final String TAG_KEY="key";
public static final String TAG_ID="id";
ArrayList blackList = new ArrayList();
@@ -49,6 +50,11 @@ public class Subcollection extends Confi
*/
String id;
+ /**
+ * SubCollection key: optional name of the document field the collection name is written to
+ */
+ String key;
+
/**
* SubCollection name
*/
@@ -70,8 +76,18 @@ public class Subcollection extends Confi
* @param name name of SubCollection
*/
public Subcollection(String id, String name, Configuration conf) {
+ this(id, name, null, conf);
+ }
+
+ /** public Constructor that also takes the SubCollection key (may be null)
+ *
+ * @param id id of SubCollection
+ * @param name name of SubCollection
+ */
+ public Subcollection(String id, String name, String key, Configuration conf) {
this(conf);
this.id=id;
+ this.key = key;
this.name = name;
}
@@ -87,6 +103,13 @@ public class Subcollection extends Confi
}
/**
+ * @return Returns the key
+ */
+ public String getKey() {
+ return key;
+ }
+
+ /**
* @return Returns the id
*/
public String getId() {
@@ -180,6 +203,12 @@ public class Subcollection extends Confi
this.blString = DOMUtil.getChildText(nodeList.item(0)).trim();
parseList(this.blackList, blString);
}
+
+ // Use the key element if exactly one is present; otherwise the key is left unset
+ nodeList = collection.getElementsByTagName(TAG_KEY);
+ if (nodeList.getLength() == 1) {
+ this.key = DOMUtil.getChildText(nodeList.item(0)).trim();
+ }
}
/**
Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java?rev=1242255&r1=1242254&r2=1242255&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java Thu Feb 9 09:55:08 2012
@@ -31,6 +31,7 @@ import org.apache.nutch.indexer.Indexing
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.collection.CollectionManager;
+import org.apache.nutch.collection.Subcollection;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
@@ -62,8 +63,12 @@ public class SubcollectionIndexingFilter
* @param url
*/
private void addSubCollectionField(NutchDocument doc, String url) {
- for (String collname: CollectionManager.getCollectionManager(getConf()).getSubCollections(url)) {
- doc.add(FIELD_NAME, collname);
+ for (Subcollection coll : CollectionManager.getCollectionManager(getConf()).getSubCollections(url)) {
+ if (coll.getKey() == null) {
+ doc.add(FIELD_NAME, coll.getName());
+ } else {
+ doc.add(coll.getKey(), coll.getName());
+ }
}
}