You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/29 06:39:03 UTC
svn commit: r1655526 [22/26] - in /nutch/trunk: ./
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/
src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/p...
Modified: nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java (original)
+++ nutch/trunk/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
* (number of "hops" from seed URLs).
*/
package org.apache.nutch.scoring.depth;
+
Modified: nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java (original)
+++ nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/LinkAnalysisScoringFilter.java Thu Jan 29 05:38:59 2015
@@ -32,8 +32,7 @@ import org.apache.nutch.protocol.Content
import org.apache.nutch.scoring.ScoringFilter;
import org.apache.nutch.scoring.ScoringFilterException;
-public class LinkAnalysisScoringFilter
- implements ScoringFilter {
+public class LinkAnalysisScoringFilter implements ScoringFilter {
private Configuration conf;
private float normalizedScore = 1.00f;
@@ -52,46 +51,44 @@ public class LinkAnalysisScoringFilter
}
public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
- ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
- CrawlDatum adjust, int allCount)
- throws ScoringFilterException {
+ ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+ CrawlDatum adjust, int allCount) throws ScoringFilterException {
return adjust;
}
public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
- throws ScoringFilterException {
+ throws ScoringFilterException {
return datum.getScore() * initSort;
}
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
- CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
- throws ScoringFilterException {
+ CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+ throws ScoringFilterException {
return (normalizedScore * dbDatum.getScore());
}
public void initialScore(Text url, CrawlDatum datum)
- throws ScoringFilterException {
+ throws ScoringFilterException {
datum.setScore(0.0f);
}
public void injectedScore(Text url, CrawlDatum datum)
- throws ScoringFilterException {
+ throws ScoringFilterException {
}
public void passScoreAfterParsing(Text url, Content content, Parse parse)
- throws ScoringFilterException {
- parse.getData().getContentMeta().set(Nutch.SCORE_KEY,
- content.getMetadata().get(Nutch.SCORE_KEY));
+ throws ScoringFilterException {
+ parse.getData().getContentMeta()
+ .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
}
public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
- throws ScoringFilterException {
+ throws ScoringFilterException {
content.getMetadata().set(Nutch.SCORE_KEY, "" + datum.getScore());
}
public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
- List<CrawlDatum> inlinked)
- throws ScoringFilterException {
+ List<CrawlDatum> inlinked) throws ScoringFilterException {
// nothing to do
}
Modified: nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java (original)
+++ nutch/trunk/src/plugin/scoring-link/src/java/org/apache/nutch/scoring/link/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
* {@link org.apache.nutch.scoring.webgraph.WebGraph}.
*/
package org.apache.nutch.scoring.link;
+
Modified: nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original)
+++ nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Thu Jan 29 05:38:59 2015
@@ -41,17 +41,17 @@ import org.apache.nutch.scoring.ScoringF
/**
* This plugin implements a variant of an Online Page Importance Computation
- * (OPIC) score, described in this paper:
- * <a href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"/>
- * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003),
- * Adaptive On-Line Page Importance Computation
- * </a>.
+ * (OPIC) score, described in this paper: <a
+ * href="http://www2003.org/cdrom/papers/refereed/p007/p7-abiteboul.html"/>
+ * Abiteboul, Serge and Preda, Mihai and Cobena, Gregory (2003), Adaptive
+ * On-Line Page Importance Computation </a>.
*
* @author Andrzej Bialecki
*/
public class OPICScoringFilter implements ScoringFilter {
- private final static Logger LOG = LoggerFactory.getLogger(OPICScoringFilter.class);
+ private final static Logger LOG = LoggerFactory
+ .getLogger(OPICScoringFilter.class);
private Configuration conf;
private float scoreInjected;
@@ -72,28 +72,35 @@ public class OPICScoringFilter implement
countFiltered = conf.getBoolean("db.score.count.filtered", false);
}
- public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException {
+ public void injectedScore(Text url, CrawlDatum datum)
+ throws ScoringFilterException {
}
- /** Set to 0.0f (unknown value) - inlink contributions will bring it to
- * a correct level. Newly discovered pages have at least one inlink. */
- public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException {
+ /**
+ * Set to 0.0f (unknown value) - inlink contributions will bring it to a
+ * correct level. Newly discovered pages have at least one inlink.
+ */
+ public void initialScore(Text url, CrawlDatum datum)
+ throws ScoringFilterException {
datum.setScore(0.0f);
}
/** Use {@link CrawlDatum#getScore()}. */
- public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException {
+ public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+ throws ScoringFilterException {
return datum.getScore() * initSort;
}
/** Increase the score by a sum of inlinked scores. */
- public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked) throws ScoringFilterException {
+ public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+ List<CrawlDatum> inlinked) throws ScoringFilterException {
float adjust = 0.0f;
for (int i = 0; i < inlinked.size(); i++) {
CrawlDatum linked = inlinked.get(i);
adjust += linked.getScore();
}
- if (old == null) old = datum;
+ if (old == null)
+ old = datum;
datum.setScore(old.getScore() + adjust);
}
@@ -104,11 +111,17 @@ public class OPICScoringFilter implement
/** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */
public void passScoreAfterParsing(Text url, Content content, Parse parse) {
- parse.getData().getContentMeta().set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
+ parse.getData().getContentMeta()
+ .set(Nutch.SCORE_KEY, content.getMetadata().get(Nutch.SCORE_KEY));
}
- /** Get a float value from Fetcher.SCORE_KEY, divide it by the number of outlinks and apply. */
- public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount) throws ScoringFilterException {
+ /**
+ * Get a float value from Fetcher.SCORE_KEY, divide it by the number of
+ * outlinks and apply.
+ */
+ public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+ ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+ CrawlDatum adjust, int allCount) throws ScoringFilterException {
float score = scoreInjected;
String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY);
if (scoreString != null) {
@@ -135,7 +148,7 @@ public class OPICScoringFilter implement
try {
String toHost = new URL(target.getKey().toString()).getHost();
String fromHost = new URL(fromUrl.toString()).getHost();
- if(toHost.equalsIgnoreCase(fromHost)){
+ if (toHost.equalsIgnoreCase(fromHost)) {
target.getValue().setScore(internalScore);
} else {
target.getValue().setScore(externalScore);
@@ -151,8 +164,10 @@ public class OPICScoringFilter implement
return adjust;
}
- /** Dampen the boost value by scorePower.*/
- public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
- return (float)Math.pow(dbDatum.getScore(), scorePower) * initScore;
+ /** Dampen the boost value by scorePower. */
+ public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+ CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+ throws ScoringFilterException {
+ return (float) Math.pow(dbDatum.getScore(), scorePower) * initScore;
}
}
Modified: nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java (original)
+++ nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
* (OPIC) algorithm.
*/
package org.apache.nutch.scoring.opic;
+
Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/CollectionManager.java Thu Jan 29 05:38:59 2015
@@ -52,22 +52,24 @@ public class CollectionManager extends C
transient Map<String, Subcollection> collectionMap = new HashMap<String, Subcollection>();
transient URL configfile;
-
+
public CollectionManager(Configuration conf) {
super(conf);
init();
}
-
- /**
+
+ /**
* Used for testing
*/
- protected CollectionManager(){
+ protected CollectionManager() {
super(NutchConfiguration.create());
}
- protected void init(){
+ protected void init() {
try {
- if (LOG.isInfoEnabled()) { LOG.info("initializing CollectionManager"); }
+ if (LOG.isInfoEnabled()) {
+ LOG.info("initializing CollectionManager");
+ }
// initialize known subcollections
configfile = getConf().getResource(
getConf().get("subcollections.config", DEFAULT_FILE_NAME));
@@ -92,7 +94,7 @@ public class CollectionManager extends C
if (LOG.isInfoEnabled()) {
LOG.info("file has " + nodeList.getLength() + " elements");
}
-
+
for (int i = 0; i < nodeList.getLength(); i++) {
Element scElem = (Element) nodeList.item(i);
Subcollection subCol = new Subcollection(getConf());
@@ -103,18 +105,18 @@ public class CollectionManager extends C
LOG.info("Cannot find collections");
}
}
-
+
public static CollectionManager getCollectionManager(Configuration conf) {
String key = "collectionmanager";
ObjectCache objectCache = ObjectCache.get(conf);
- CollectionManager impl = (CollectionManager)objectCache.getObject(key);
+ CollectionManager impl = (CollectionManager) objectCache.getObject(key);
if (impl == null) {
try {
if (LOG.isInfoEnabled()) {
LOG.info("Instantiating CollectionManager");
}
- impl=new CollectionManager(conf);
- objectCache.setObject(key,impl);
+ impl = new CollectionManager(conf);
+ objectCache.setObject(key, impl);
} catch (Exception e) {
throw new RuntimeException("Couldn't create CollectionManager", e);
}
@@ -165,7 +167,7 @@ public class CollectionManager extends C
/**
* Return names of collections url is part of
- *
+ *
* @param url
* The url to test against Collections
* @return Subcollections
@@ -203,8 +205,8 @@ public class CollectionManager extends C
*/
public void save() throws IOException {
try {
- final FileOutputStream fos = new FileOutputStream(new File(configfile
- .getFile()));
+ final FileOutputStream fos = new FileOutputStream(new File(
+ configfile.getFile()));
final Document doc = new DocumentImpl();
final Element collections = doc
.createElement(Subcollection.TAG_COLLECTIONS);
Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/collection/Subcollection.java Thu Jan 29 05:38:59 2015
@@ -32,20 +32,20 @@ import org.w3c.dom.NodeList;
* SubCollection represents a subset of index, you can define url patterns that
* will indicate that particular page (url) is part of SubCollection.
*/
-public class Subcollection extends Configured implements URLFilter{
-
- public static final String TAG_COLLECTIONS="subcollections";
- public static final String TAG_COLLECTION="subcollection";
- public static final String TAG_WHITELIST="whitelist";
- public static final String TAG_BLACKLIST="blacklist";
- public static final String TAG_NAME="name";
- public static final String TAG_KEY="key";
- public static final String TAG_ID="id";
+public class Subcollection extends Configured implements URLFilter {
+
+ public static final String TAG_COLLECTIONS = "subcollections";
+ public static final String TAG_COLLECTION = "subcollection";
+ public static final String TAG_WHITELIST = "whitelist";
+ public static final String TAG_BLACKLIST = "blacklist";
+ public static final String TAG_NAME = "name";
+ public static final String TAG_KEY = "key";
+ public static final String TAG_ID = "id";
List<String> blackList = new ArrayList<String>();
List<String> whiteList = new ArrayList<String>();
- /**
+ /**
* SubCollection identifier
*/
String id;
@@ -55,12 +55,12 @@ public class Subcollection extends Confi
*/
String key;
- /**
+ /**
* SubCollection name
*/
String name;
- /**
+ /**
* SubCollection whitelist as String
*/
String wlString;
@@ -70,31 +70,37 @@ public class Subcollection extends Confi
*/
String blString;
- /** public Constructor
+ /**
+ * public Constructor
*
- * @param id id of SubCollection
- * @param name name of SubCollection
+ * @param id
+ * id of SubCollection
+ * @param name
+ * name of SubCollection
*/
public Subcollection(String id, String name, Configuration conf) {
this(id, name, null, conf);
}
- /** public Constructor
- *
- * @param id id of SubCollection
- * @param name name of SubCollection
+ /**
+ * public Constructor
+ *
+ * @param id
+ * id of SubCollection
+ * @param name
+ * name of SubCollection
*/
public Subcollection(String id, String name, String key, Configuration conf) {
this(conf);
- this.id=id;
+ this.id = id;
this.key = key;
this.name = name;
}
- public Subcollection(Configuration conf){
+ public Subcollection(Configuration conf) {
super(conf);
}
-
+
/**
* @return Returns the name
*/
@@ -232,7 +238,8 @@ public class Subcollection extends Confi
/**
* Set contents of blacklist from String
*
- * @param list the blacklist contents
+ * @param list
+ * the blacklist contents
*/
public void setBlackList(String list) {
this.blString = list;
@@ -242,7 +249,8 @@ public class Subcollection extends Confi
/**
* Set contents of whitelist from String
*
- * @param list the whitelist contents
+ * @param list
+ * the whitelist contents
*/
public void setWhiteList(String list) {
this.wlString = list;
Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -35,21 +35,22 @@ import org.apache.nutch.collection.Subco
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
-
-public class SubcollectionIndexingFilter extends Configured implements IndexingFilter {
+public class SubcollectionIndexingFilter extends Configured implements
+ IndexingFilter {
private Configuration conf;
- public SubcollectionIndexingFilter(){
+ public SubcollectionIndexingFilter() {
super(NutchConfiguration.create());
}
-
+
public SubcollectionIndexingFilter(Configuration conf) {
super(conf);
}
-
+
/**
- * @param Configuration conf
+ * @param Configuration
+ * conf
*/
public void setConf(Configuration conf) {
this.conf = conf;
@@ -63,7 +64,6 @@ public class SubcollectionIndexingFilter
return this.conf;
}
-
/**
* Doc field name
*/
@@ -72,7 +72,8 @@ public class SubcollectionIndexingFilter
/**
* Logger
*/
- public static final Logger LOG = LoggerFactory.getLogger(SubcollectionIndexingFilter.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(SubcollectionIndexingFilter.class);
/**
* "Mark" document to be a part of subcollection
@@ -81,7 +82,8 @@ public class SubcollectionIndexingFilter
* @param url
*/
private void addSubCollectionField(NutchDocument doc, String url) {
- for (Subcollection coll : CollectionManager.getCollectionManager(getConf()).getSubCollections(url)) {
+ for (Subcollection coll : CollectionManager.getCollectionManager(getConf())
+ .getSubCollections(url)) {
if (coll.getKey() == null) {
doc.add(fieldName, coll.getName());
} else {
@@ -90,7 +92,8 @@ public class SubcollectionIndexingFilter
}
}
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
String sUrl = url.toString();
addSubCollectionField(doc, sUrl);
return doc;
Modified: nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/package-info.java Thu Jan 29 05:38:59 2015
@@ -22,3 +22,4 @@
* {@link org.apache.nutch.collection}.
*/
package org.apache.nutch.indexer.subcollection;
+
Modified: nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java (original)
+++ nutch/trunk/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java Thu Jan 29 05:38:59 2015
@@ -25,31 +25,34 @@ import org.junit.Assert;
import org.junit.Test;
public class TestSubcollection {
-
- /**Test filtering logic
+
+ /**
+ * Test filtering logic
*
* @throws Exception
*/
@Test
public void testFilter() throws Exception {
- Subcollection sc=new Subcollection(NutchConfiguration.create());
+ Subcollection sc = new Subcollection(NutchConfiguration.create());
sc.setWhiteList("www.nutch.org\nwww.apache.org");
sc.setBlackList("jpg\nwww.apache.org/zecret/");
-
- //matches whitelist
- Assert.assertEquals("http://www.apache.org/index.html", sc.filter("http://www.apache.org/index.html"));
-
- //matches blacklist
- Assert.assertEquals(null, sc.filter("http://www.apache.org/zecret/index.html"));
+
+ // matches whitelist
+ Assert.assertEquals("http://www.apache.org/index.html",
+ sc.filter("http://www.apache.org/index.html"));
+
+ // matches blacklist
+ Assert.assertEquals(null,
+ sc.filter("http://www.apache.org/zecret/index.html"));
Assert.assertEquals(null, sc.filter("http://www.apache.org/img/image.jpg"));
-
- //no match
+
+ // no match
Assert.assertEquals(null, sc.filter("http://www.google.com/"));
}
-
+
@Test
- public void testInput(){
- StringBuffer xml=new StringBuffer();
+ public void testInput() {
+ StringBuffer xml = new StringBuffer();
xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
xml.append("<!-- just a comment -->");
xml.append("<subcollections>");
@@ -65,44 +68,45 @@ public class TestSubcollection {
xml.append("</blacklist>");
xml.append("</subcollection>");
xml.append("</subcollections>");
-
- InputStream is=new ByteArrayInputStream(xml.toString().getBytes());
-
- CollectionManager cm=new CollectionManager();
+
+ InputStream is = new ByteArrayInputStream(xml.toString().getBytes());
+
+ CollectionManager cm = new CollectionManager();
cm.parse(is);
-
- Collection<?> c=cm.getAll();
-
+
+ Collection<?> c = cm.getAll();
+
// test that size matches
- Assert.assertEquals(1,c.size());
-
- Subcollection collection=(Subcollection)c.toArray()[0];
-
- //test collection id
+ Assert.assertEquals(1, c.size());
+
+ Subcollection collection = (Subcollection) c.toArray()[0];
+
+ // test collection id
Assert.assertEquals("nutch", collection.getId());
-
- //test collection name
+
+ // test collection name
Assert.assertEquals("nutch collection", collection.getName());
- //test whitelist
- Assert.assertEquals(2,collection.whiteList.size());
-
- String wlUrl=(String)collection.whiteList.get(0);
+ // test whitelist
+ Assert.assertEquals(2, collection.whiteList.size());
+
+ String wlUrl = (String) collection.whiteList.get(0);
Assert.assertEquals("http://lucene.apache.org/nutch/", wlUrl);
- wlUrl=(String)collection.whiteList.get(1);
+ wlUrl = (String) collection.whiteList.get(1);
Assert.assertEquals("http://wiki.apache.org/nutch/", wlUrl);
-
- //matches whitelist
- Assert.assertEquals("http://lucene.apache.org/nutch/", collection.filter("http://lucene.apache.org/nutch/"));
- //test blacklist
- Assert.assertEquals(1,collection.blackList.size());
+ // matches whitelist
+ Assert.assertEquals("http://lucene.apache.org/nutch/",
+ collection.filter("http://lucene.apache.org/nutch/"));
+
+ // test blacklist
+ Assert.assertEquals(1, collection.blackList.size());
- String blUrl=(String)collection.blackList.get(0);
+ String blUrl = (String) collection.blackList.get(0);
Assert.assertEquals("http://www.xxx.yyy", blUrl);
- //no match
+ // no match
Assert.assertEquals(null, collection.filter("http://www.google.com/"));
}
}
Modified: nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -34,23 +34,25 @@ import org.apache.nutch.util.domain.Doma
/**
* Adds the Top level domain extensions to the index
+ *
* @author Enis Soztutar <enis.soz.nutch@gmail.com>
*/
public class TLDIndexingFilter implements IndexingFilter {
- public static final Logger LOG = LoggerFactory.getLogger(TLDIndexingFilter.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(TLDIndexingFilter.class);
private Configuration conf;
- public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks)
- throws IndexingException {
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
try {
URL url = new URL(urlText.toString());
DomainSuffix d = URLUtil.getDomainSuffix(url);
-
+
doc.add("tld", d.getDomain());
-
- }catch (Exception ex) {
+
+ } catch (Exception ex) {
LOG.warn(ex.toString());
}
Modified: nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java (original)
+++ nutch/trunk/src/plugin/tld/src/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java Thu Jan 29 05:38:59 2015
@@ -35,9 +35,9 @@ import org.apache.nutch.scoring.ScoringF
import org.apache.nutch.util.domain.DomainSuffix;
import org.apache.nutch.util.domain.DomainSuffixes;
-
/**
* Scoring filter to boost tlds.
+ *
* @author Enis Soztutar <enis.soz.nutch@gmail.com>
*/
public class TLDScoringFilter implements ScoringFilter {
@@ -56,10 +56,10 @@ public class TLDScoringFilter implements
NutchField tlds = doc.getField("tld");
float boost = 1.0f;
- if(tlds != null) {
- for(Object tld : tlds.getValues()) {
+ if (tlds != null) {
+ for (Object tld : tlds.getValues()) {
DomainSuffix entry = tldEntries.get(tld.toString());
- if(entry != null)
+ if (entry != null)
boost *= entry.getBoost();
}
}
@@ -93,9 +93,8 @@ public class TLDScoringFilter implements
throws ScoringFilterException {
}
- public void updateDbScore(Text url, CrawlDatum old,
- CrawlDatum datum, List<CrawlDatum> inlinked)
- throws ScoringFilterException {
+ public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+ List<CrawlDatum> inlinked) throws ScoringFilterException {
}
public Configuration getConf() {
@@ -105,9 +104,10 @@ public class TLDScoringFilter implements
public void setConf(Configuration conf) {
this.conf = conf;
}
- public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
- Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust,
- int allCount) throws ScoringFilterException {
+
+ public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+ ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+ CrawlDatum adjust, int allCount) throws ScoringFilterException {
return adjust;
}
Modified: nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-automaton/src/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java Thu Jan 29 05:38:59 2015
@@ -32,12 +32,11 @@ import org.apache.nutch.net.*;
import org.apache.nutch.urlfilter.api.RegexRule;
import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
-
/**
- * RegexURLFilterBase implementation based on the
- * <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
- * Finite-State Automata for Java<sup>TM</sup>.
- *
+ * RegexURLFilterBase implementation based on the <a
+ * href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State
+ * Automata for Java<sup>TM</sup>.
+ *
* @author Jérôme Charron
* @see <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
*/
@@ -49,24 +48,24 @@ public class AutomatonURLFilter extends
super();
}
- public AutomatonURLFilter(String filename)
- throws IOException, PatternSyntaxException {
+ public AutomatonURLFilter(String filename) throws IOException,
+ PatternSyntaxException {
super(filename);
}
- AutomatonURLFilter(Reader reader)
- throws IOException, IllegalArgumentException {
+ AutomatonURLFilter(Reader reader) throws IOException,
+ IllegalArgumentException {
super(reader);
}
-
- /* ----------------------------------- *
- * <implementation:RegexURLFilterBase> *
- * ----------------------------------- */
-
+ /*
+ * ----------------------------------- * <implementation:RegexURLFilterBase> *
+ * -----------------------------------
+ */
+
/**
- * Rules specified as a config property will override rules specified
- * as a config file.
+ * Rules specified as a config property will override rules specified as a
+ * config file.
*/
protected Reader getRulesReader(Configuration conf) throws IOException {
String stringRules = conf.get(URLFILTER_AUTOMATON_RULES);
@@ -81,21 +80,20 @@ public class AutomatonURLFilter extends
protected RegexRule createRule(boolean sign, String regex) {
return new Rule(sign, regex);
}
-
- /* ------------------------------------ *
- * </implementation:RegexURLFilterBase> *
- * ------------------------------------ */
-
+ /*
+ * ------------------------------------ * </implementation:RegexURLFilterBase>
+ * * ------------------------------------
+ */
+
public static void main(String args[]) throws IOException {
main(new AutomatonURLFilter(), args);
}
-
private class Rule extends RegexRule {
-
+
private RunAutomaton automaton;
-
+
Rule(boolean sign, String regex) {
super(sign, regex);
automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
@@ -105,5 +103,5 @@ public class AutomatonURLFilter extends
return automaton.run(url);
}
}
-
+
}
Modified: nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java Thu Jan 29 05:38:59 2015
@@ -26,10 +26,9 @@ import org.apache.nutch.urlfilter.api.Re
import org.junit.Assert;
import org.junit.Test;
-
/**
* JUnit based test of class <code>AutomatonURLFilter</code>.
- *
+ *
* @author Jérôme Charron
*/
public class TestAutomatonURLFilter extends RegexURLFilterBaseTest {
@@ -42,7 +41,7 @@ public class TestAutomatonURLFilter exte
return null;
}
}
-
+
@Test
public void test() {
test("WholeWebCrawling");
Modified: nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java Thu Jan 29 05:38:59 2015
@@ -35,35 +35,48 @@ import org.apache.nutch.util.URLUtil;
import org.apache.nutch.util.domain.DomainSuffix;
/**
- * <p>Filters URLs based on a file containing domain suffixes, domain names, and
+ * <p>
+ * Filters URLs based on a file containing domain suffixes, domain names, and
* hostnames. Only a url that matches one of the suffixes, domains, or hosts
- * present in the file is allowed.</p>
+ * present in the file is allowed.
+ * </p>
*
- * <p>Urls are checked in order of domain suffix, domain name, and hostname
- * against entries in the domain file. The domain file would be setup as follows
- * with one entry per line:
- *
- * <pre> com apache.org www.apache.org </pre>
- *
- * <p>The first line is an example of a filter that would allow all .com
- * domains. The second line allows all urls from apache.org and all of its
- * subdomains such as lucene.apache.org and hadoop.apache.org. The third line
- * would allow only urls from www.apache.org. There is no specific ordering to
- * entries. The entries are from more general to more specific with the more
- * general overridding the more specific.</p>
+ * <p>
+ * Urls are checked in order of domain suffix, domain name, and hostname against
+ * entries in the domain file. The domain file would be setup as follows with
+ * one entry per line:
+ *
+ * <pre>
+ * com apache.org www.apache.org
+ * </pre>
+ *
+ * <p>
+ * The first line is an example of a filter that would allow all .com domains.
+ * The second line allows all urls from apache.org and all of its subdomains
+ * such as lucene.apache.org and hadoop.apache.org. The third line would allow
+ * only urls from www.apache.org. There is no specific ordering to entries. The
+ * entries are from more general to more specific with the more general
+ * overridding the more specific.
+ * </p>
*
* The domain file defaults to domain-urlfilter.txt in the classpath but can be
* overridden using the:
*
- * <ul> <ol>property "urlfilter.domain.file" in ./conf/nutch-*.xml, and</ol>
- * <ol>attribute "file" in plugin.xml of this plugin</ol> </ul>
+ * <ul>
+ * <ol>
+ * property "urlfilter.domain.file" in ./conf/nutch-*.xml, and
+ * </ol>
+ * <ol>
+ * attribute "file" in plugin.xml of this plugin
+ * </ol>
+ * </ul>
*
* the attribute "file" has higher precedence if defined.
*/
-public class DomainURLFilter
- implements URLFilter {
+public class DomainURLFilter implements URLFilter {
- private static final Logger LOG = LoggerFactory.getLogger(DomainURLFilter.class);
+ private static final Logger LOG = LoggerFactory
+ .getLogger(DomainURLFilter.class);
// read in attribute "file" of this plugin.
private static String attributeFile = null;
@@ -71,8 +84,7 @@ public class DomainURLFilter
private String domainFile = null;
private Set<String> domainSet = new LinkedHashSet<String>();
- private void readConfiguration(Reader configReader)
- throws IOException {
+ private void readConfiguration(Reader configReader) throws IOException {
// read the configuration file, line by line
BufferedReader reader = new BufferedReader(configReader);
@@ -95,7 +107,8 @@ public class DomainURLFilter
/**
* Constructor that specifies the domain file to use.
*
- * @param domainFile The domain file, overrides domain-urlfilter.text default.
+ * @param domainFile
+ * The domain file, overrides domain-urlfilter.text default.
*
* @throws IOException
*/
@@ -111,8 +124,8 @@ public class DomainURLFilter
// get the extensions for domain urlfilter
String pluginName = "urlfilter-domain";
- Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
- URLFilter.class.getName()).getExtensions();
+ Extension[] extensions = PluginRepository.get(conf)
+ .getExtensionPoint(URLFilter.class.getName()).getExtensions();
for (int i = 0; i < extensions.length; i++) {
Extension extension = extensions[i];
if (extension.getDescriptor().getPluginId().equals(pluginName)) {
@@ -120,32 +133,30 @@ public class DomainURLFilter
break;
}
}
-
+
// handle blank non empty input
if (attributeFile != null && attributeFile.trim().equals("")) {
attributeFile = null;
}
-
+
if (attributeFile != null) {
if (LOG.isInfoEnabled()) {
LOG.info("Attribute \"file\" is defined for plugin " + pluginName
- + " as " + attributeFile);
+ + " as " + attributeFile);
}
- }
- else {
+ } else {
if (LOG.isWarnEnabled()) {
LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
- + pluginName);
+ + pluginName);
}
}
// domain file and attribute "file" take precedence if defined
- String file = conf.get("urlfilter.domain.file");
+ String file = conf.get("urlfilter.domain.file");
String stringRules = conf.get("urlfilter.domain.rules");
if (domainFile != null) {
file = domainFile;
- }
- else if (attributeFile != null) {
+ } else if (attributeFile != null) {
file = attributeFile;
}
Reader reader = null;
@@ -159,8 +170,7 @@ public class DomainURLFilter
reader = new FileReader(file);
}
readConfiguration(reader);
- }
- catch (IOException e) {
+ } catch (IOException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
}
}
@@ -173,7 +183,7 @@ public class DomainURLFilter
try {
- // match for suffix, domain, and host in that order. more general will
+ // match for suffix, domain, and host in that order. more general will
// override more specific
String domain = URLUtil.getDomainName(url).toLowerCase().trim();
String host = URLUtil.getHost(url);
@@ -182,20 +192,19 @@ public class DomainURLFilter
if (domainSuffix != null) {
suffix = domainSuffix.getDomain();
}
-
+
if (domainSet.contains(suffix) || domainSet.contains(domain)
- || domainSet.contains(host)) {
+ || domainSet.contains(host)) {
return url;
}
// doesn't match, don't allow
return null;
- }
- catch (Exception e) {
-
+ } catch (Exception e) {
+
// if an error happens, allow the url to pass
LOG.error("Could not apply filter on url: " + url + "\n"
- + org.apache.hadoop.util.StringUtils.stringifyException(e));
+ + org.apache.hadoop.util.StringUtils.stringifyException(e));
return null;
}
}
Modified: nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Thu Jan 29 05:38:59 2015
@@ -23,13 +23,11 @@ import org.junit.Test;
public class TestDomainURLFilter {
-
private final static String SEPARATOR = System.getProperty("file.separator");
private final static String SAMPLES = System.getProperty("test.data", ".");
@Test
- public void testFilter()
- throws Exception {
+ public void testFilter() throws Exception {
String domainFile = SAMPLES + SEPARATOR + "hosts.txt";
Configuration conf = NutchConfiguration.create();
Modified: nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java Thu Jan 29 05:38:59 2015
@@ -35,35 +35,48 @@ import org.apache.nutch.util.URLUtil;
import org.apache.nutch.util.domain.DomainSuffix;
/**
- * <p>Filters URLs based on a file containing domain suffixes, domain names, and
- * hostnames. A url that matches one of the suffixes, domains, or hosts
- * present in the file is filtered out.</p>
- *
- * <p>Urls are checked in order of domain suffix, domain name, and hostname
- * against entries in the domain file. The domain file would be setup as follows
- * with one entry per line:
- *
- * <pre> com apache.org www.apache.org </pre>
- *
- * <p>The first line is an example of a filter that would allow all .com
- * domains. The second line allows all urls from apache.org and all of its
- * subdomains such as lucene.apache.org and hadoop.apache.org. The third line
- * would allow only urls from www.apache.org. There is no specific ordering to
- * entries. The entries are from more general to more specific with the more
- * general overridding the more specific.</p>
- *
- * The domain file defaults to domainblacklist-urlfilter.txt in the classpath but can be
- * overridden using the:
- *
- * <ul> <ol>property "urlfilter.domainblacklist.file" in ./conf/nutch-*.xml, and</ol>
- * <ol>attribute "file" in plugin.xml of this plugin</ol> </ul>
+ * <p>
+ * Filters URLs based on a file containing domain suffixes, domain names, and
+ * hostnames. A url that matches one of the suffixes, domains, or hosts present
+ * in the file is filtered out.
+ * </p>
+ *
+ * <p>
+ * Urls are checked in order of domain suffix, domain name, and hostname against
+ * entries in the domain file. The domain file would be setup as follows with
+ * one entry per line:
+ *
+ * <pre>
+ * com apache.org www.apache.org
+ * </pre>
+ *
+ * <p>
+ * The first line is an example of a filter that would allow all .com domains.
+ * The second line allows all urls from apache.org and all of its subdomains
+ * such as lucene.apache.org and hadoop.apache.org. The third line would allow
+ * only urls from www.apache.org. There is no specific ordering to entries. The
+ * entries are from more general to more specific with the more general
+ * overridding the more specific.
+ * </p>
+ *
+ * The domain file defaults to domainblacklist-urlfilter.txt in the classpath
+ * but can be overridden using the:
+ *
+ * <ul>
+ * <ol>
+ * property "urlfilter.domainblacklist.file" in ./conf/nutch-*.xml, and
+ * </ol>
+ * <ol>
+ * attribute "file" in plugin.xml of this plugin
+ * </ol>
+ * </ul>
*
* the attribute "file" has higher precedence if defined.
*/
-public class DomainBlacklistURLFilter
- implements URLFilter {
+public class DomainBlacklistURLFilter implements URLFilter {
- private static final Logger LOG = LoggerFactory.getLogger(DomainBlacklistURLFilter.class);
+ private static final Logger LOG = LoggerFactory
+ .getLogger(DomainBlacklistURLFilter.class);
// read in attribute "file" of this plugin.
private static String attributeFile = null;
@@ -71,8 +84,7 @@ public class DomainBlacklistURLFilter
private String domainFile = null;
private Set<String> domainSet = new LinkedHashSet<String>();
- private void readConfiguration(Reader configReader)
- throws IOException {
+ private void readConfiguration(Reader configReader) throws IOException {
// read the configuration file, line by line
BufferedReader reader = new BufferedReader(configReader);
@@ -95,7 +107,8 @@ public class DomainBlacklistURLFilter
/**
* Constructor that specifies the domain file to use.
*
- * @param domainFile The domain file, overrides domainblacklist-urlfilter.text default.
+ * @param domainFile
+ * The domain file, overrides domainblacklist-urlfilter.text default.
*
* @throws IOException
*/
@@ -111,8 +124,8 @@ public class DomainBlacklistURLFilter
// get the extensions for domain urlfilter
String pluginName = "urlfilter-domainblacklist";
- Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
- URLFilter.class.getName()).getExtensions();
+ Extension[] extensions = PluginRepository.get(conf)
+ .getExtensionPoint(URLFilter.class.getName()).getExtensions();
for (int i = 0; i < extensions.length; i++) {
Extension extension = extensions[i];
if (extension.getDescriptor().getPluginId().equals(pluginName)) {
@@ -129,23 +142,21 @@ public class DomainBlacklistURLFilter
if (attributeFile != null) {
if (LOG.isInfoEnabled()) {
LOG.info("Attribute \"file\" is defined for plugin " + pluginName
- + " as " + attributeFile);
+ + " as " + attributeFile);
}
- }
- else {
+ } else {
if (LOG.isWarnEnabled()) {
LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
- + pluginName);
+ + pluginName);
}
}
// domain file and attribute "file" take precedence if defined
- String file = conf.get("urlfilter.domainblacklist.file");
+ String file = conf.get("urlfilter.domainblacklist.file");
String stringRules = conf.get("urlfilter.domainblacklist.rules");
if (domainFile != null) {
file = domainFile;
- }
- else if (attributeFile != null) {
+ } else if (attributeFile != null) {
file = attributeFile;
}
Reader reader = null;
@@ -159,8 +170,7 @@ public class DomainBlacklistURLFilter
reader = new FileReader(file);
}
readConfiguration(reader);
- }
- catch (IOException e) {
+ } catch (IOException e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
}
}
@@ -171,7 +181,7 @@ public class DomainBlacklistURLFilter
public String filter(String url) {
try {
- // match for suffix, domain, and host in that order. more general will
+ // match for suffix, domain, and host in that order. more general will
// override more specific
String domain = URLUtil.getDomainName(url).toLowerCase().trim();
String host = URLUtil.getHost(url);
@@ -182,19 +192,18 @@ public class DomainBlacklistURLFilter
}
if (domainSet.contains(suffix) || domainSet.contains(domain)
- || domainSet.contains(host)) {
+ || domainSet.contains(host)) {
// Matches, filter!
return null;
}
// doesn't match, allow
return url;
- }
- catch (Exception e) {
+ } catch (Exception e) {
// if an error happens, allow the url to pass
LOG.error("Could not apply filter on url: " + url + "\n"
- + org.apache.hadoop.util.StringUtils.stringifyException(e));
+ + org.apache.hadoop.util.StringUtils.stringifyException(e));
return null;
}
}
Modified: nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java Thu Jan 29 05:38:59 2015
@@ -27,12 +27,12 @@ public class TestDomainBlacklistURLFilte
private final static String SAMPLES = System.getProperty("test.data", ".");
@Test
- public void testFilter()
- throws Exception {
+ public void testFilter() throws Exception {
String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
Configuration conf = NutchConfiguration.create();
- DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter(domainBlacklistFile);
+ DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter(
+ domainBlacklistFile);
domainBlacklistFilter.setConf(conf);
Assert.assertNull(domainBlacklistFilter.filter("http://lucene.apache.org"));
Assert.assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org"));
Modified: nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java Thu Jan 29 05:38:59 2015
@@ -39,16 +39,19 @@ import java.util.List;
import java.util.ArrayList;
/**
- * Filters URLs based on a file of URL prefixes. The file is named by
- * (1) property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and
- * (2) attribute "file" in plugin.xml of this plugin
- * Attribute "file" has higher precedence if defined.
- *
- * <p>The format of this file is one URL prefix per line.</p>
+ * Filters URLs based on a file of URL prefixes. The file is named by (1)
+ * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and (2)
+ * attribute "file" in plugin.xml of this plugin Attribute "file" has higher
+ * precedence if defined.
+ *
+ * <p>
+ * The format of this file is one URL prefix per line.
+ * </p>
*/
public class PrefixURLFilter implements URLFilter {
- private static final Logger LOG = LoggerFactory.getLogger(PrefixURLFilter.class);
+ private static final Logger LOG = LoggerFactory
+ .getLogger(PrefixURLFilter.class);
// read in attribute "file" of this plugin.
private static String attributeFile = null;
@@ -58,7 +61,7 @@ public class PrefixURLFilter implements
private Configuration conf;
public PrefixURLFilter() throws IOException {
-
+
}
public PrefixURLFilter(String stringRules) throws IOException {
@@ -72,43 +75,43 @@ public class PrefixURLFilter implements
return url;
}
- private TrieStringMatcher readConfiguration(Reader reader)
- throws IOException {
-
- BufferedReader in=new BufferedReader(reader);
+ private TrieStringMatcher readConfiguration(Reader reader) throws IOException {
+
+ BufferedReader in = new BufferedReader(reader);
List<String> urlprefixes = new ArrayList<String>();
String line;
- while((line=in.readLine())!=null) {
+ while ((line = in.readLine()) != null) {
if (line.length() == 0)
continue;
- char first=line.charAt(0);
+ char first = line.charAt(0);
switch (first) {
- case ' ' : case '\n' : case '#' : // skip blank & comment lines
+ case ' ':
+ case '\n':
+ case '#': // skip blank & comment lines
continue;
- default :
- urlprefixes.add(line);
+ default:
+ urlprefixes.add(line);
}
}
return new PrefixStringMatcher(urlprefixes);
}
- public static void main(String args[])
- throws IOException {
-
+ public static void main(String args[]) throws IOException {
+
PrefixURLFilter filter;
if (args.length >= 1)
filter = new PrefixURLFilter(args[0]);
else
filter = new PrefixURLFilter();
-
- BufferedReader in=new BufferedReader(new InputStreamReader(System.in));
+
+ BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
String line;
- while((line=in.readLine())!=null) {
- String out=filter.filter(line);
- if(out!=null) {
+ while ((line = in.readLine()) != null) {
+ String out = filter.filter(line);
+ if (out != null) {
System.out.println(out);
}
}
@@ -118,8 +121,8 @@ public class PrefixURLFilter implements
this.conf = conf;
String pluginName = "urlfilter-prefix";
- Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(
- URLFilter.class.getName()).getExtensions();
+ Extension[] extensions = PluginRepository.get(conf)
+ .getExtensionPoint(URLFilter.class.getName()).getExtensions();
for (int i = 0; i < extensions.length; i++) {
Extension extension = extensions[i];
if (extension.getDescriptor().getPluginId().equals(pluginName)) {
@@ -136,8 +139,8 @@ public class PrefixURLFilter implements
}
} else {
// if (LOG.isWarnEnabled()) {
- // LOG.warn("Attribute \"file\" is not defined in plugin.xml for
- // plugin "+pluginName);
+ // LOG.warn("Attribute \"file\" is not defined in plugin.xml for
+ // plugin "+pluginName);
// }
}
@@ -159,7 +162,9 @@ public class PrefixURLFilter implements
try {
trie = readConfiguration(reader);
} catch (IOException e) {
- if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
+ if (LOG.isErrorEnabled()) {
+ LOG.error(e.getMessage());
+ }
// TODO mb@media-style.com: throw Exception? Because broken api.
throw new RuntimeException(e.getMessage(), e);
}
@@ -169,5 +174,5 @@ public class PrefixURLFilter implements
public Configuration getConf() {
return this.conf;
}
-
+
}
Modified: nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-regex/src/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java Thu Jan 29 05:38:59 2015
@@ -28,13 +28,12 @@ import org.apache.nutch.urlfilter.api.Re
import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
import org.apache.nutch.util.NutchConfiguration;
-
/**
* Filters URLs based on a file of regular expressions using the
* {@link java.util.regex Java Regex implementation}.
*/
public class RegexURLFilter extends RegexURLFilterBase {
-
+
public static final String URLFILTER_REGEX_FILE = "urlfilter.regex.file";
public static final String URLFILTER_REGEX_RULES = "urlfilter.regex.rules";
@@ -42,24 +41,23 @@ public class RegexURLFilter extends Rege
super();
}
- public RegexURLFilter(String filename)
- throws IOException, PatternSyntaxException {
+ public RegexURLFilter(String filename) throws IOException,
+ PatternSyntaxException {
super(filename);
}
- RegexURLFilter(Reader reader)
- throws IOException, IllegalArgumentException {
+ RegexURLFilter(Reader reader) throws IOException, IllegalArgumentException {
super(reader);
}
-
- /* ----------------------------------- *
- * <implementation:RegexURLFilterBase> *
- * ----------------------------------- */
-
+ /*
+ * ----------------------------------- * <implementation:RegexURLFilterBase> *
+ * -----------------------------------
+ */
+
/**
- * Rules specified as a config property will override rules specified
- * as a config file.
+ * Rules specified as a config property will override rules specified as a
+ * config file.
*/
protected Reader getRulesReader(Configuration conf) throws IOException {
String stringRules = conf.get(URLFILTER_REGEX_RULES);
@@ -74,23 +72,22 @@ public class RegexURLFilter extends Rege
protected RegexRule createRule(boolean sign, String regex) {
return new Rule(sign, regex);
}
-
- /* ------------------------------------ *
- * </implementation:RegexURLFilterBase> *
- * ------------------------------------ */
-
+ /*
+ * ------------------------------------ * </implementation:RegexURLFilterBase>
+ * * ------------------------------------
+ */
+
public static void main(String args[]) throws IOException {
RegexURLFilter filter = new RegexURLFilter();
filter.setConf(NutchConfiguration.create());
main(filter, args);
}
-
private class Rule extends RegexRule {
-
+
private Pattern pattern;
-
+
Rule(boolean sign, String regex) {
super(sign, regex);
pattern = Pattern.compile(regex);
@@ -100,5 +97,5 @@ public class RegexURLFilter extends Rege
return pattern.matcher(url).find();
}
}
-
+
}
Modified: nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java Thu Jan 29 05:38:59 2015
@@ -26,15 +26,13 @@ import org.apache.nutch.urlfilter.api.Re
import org.junit.Assert;
import org.junit.Test;
-
/**
* JUnit based test of class <code>RegexURLFilter</code>.
- *
+ *
* @author Jérôme Charron
*/
public class TestRegexURLFilter extends RegexURLFilterBaseTest {
-
protected URLFilter getURLFilter(Reader rules) {
try {
return new RegexURLFilter(rules);
@@ -43,7 +41,7 @@ public class TestRegexURLFilter extends
return null;
}
}
-
+
@Test
public void test() {
test("WholeWebCrawling");
Modified: nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java Thu Jan 29 05:38:59 2015
@@ -51,14 +51,15 @@ import java.net.MalformedURLException;
* Attribute "file" has higher precedence if defined. If the config file is
* missing, all URLs will be rejected.
*
- * <p>This filter can be configured to work in one of two modes:
+ * <p>
+ * This filter can be configured to work in one of two modes:
* <ul>
- * <li><b>default to reject</b> ('-'): in this mode, only URLs that match suffixes
- * specified in the config file will be accepted, all other URLs will be
- * rejected.</li>
- * <li><b>default to accept</b> ('+'): in this mode, only URLs that match suffixes
- * specified in the config file will be rejected, all other URLs will be
- * accepted.</li>
+ * <li><b>default to reject</b> ('-'): in this mode, only URLs that match
+ * suffixes specified in the config file will be accepted, all other URLs will
+ * be rejected.</li>
+ * <li><b>default to accept</b> ('+'): in this mode, only URLs that match
+ * suffixes specified in the config file will be rejected, all other URLs will
+ * be accepted.</li>
* </ul>
* <p>
* The format of this config file is one URL suffix per line, with no preceding
@@ -67,10 +68,10 @@ import java.net.MalformedURLException;
* </p>
* <p>
* A single '+' or '-' sign not followed by any suffix must be used once, to
- * signify the mode this plugin operates in. An optional single 'I' can be appended,
- * to signify that suffix matches should be case-insensitive. The default, if
- * not specified, is to use case-sensitive matches, i.e. suffix '.JPG'
- * does not match '.jpg'.
+ * signify the mode this plugin operates in. An optional single 'I' can be
+ * appended, to signify that suffix matches should be case-insensitive. The
+ * default, if not specified, is to use case-sensitive matches, i.e. suffix
+ * '.JPG' does not match '.jpg'.
* </p>
* <p>
* NOTE: the format of this file is different from urlfilter-prefix, because
@@ -82,8 +83,8 @@ import java.net.MalformedURLException;
* <h4>Example 1</h4>
* <p>
* The configuration shown below will accept all URLs with '.html' or '.htm'
- * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected),
- * and prohibit all other suffixes.
+ * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), and prohibit
+ * all other suffixes.
* <p>
*
* <pre>
@@ -91,7 +92,7 @@ import java.net.MalformedURLException;
*
* # prohibit all unknown, case-sensitive matching
* -
- *
+ *
* # collect only HTML files.
* .html
* .htm
@@ -119,11 +120,13 @@ import java.net.MalformedURLException;
* </pre>
*
* </p>
+ *
* @author Andrzej Bialecki
*/
public class SuffixURLFilter implements URLFilter {
- private static final Logger LOG = LoggerFactory.getLogger(SuffixURLFilter.class);
+ private static final Logger LOG = LoggerFactory
+ .getLogger(SuffixURLFilter.class);
// read in attribute "file" of this plugin.
private String attributeFile = null;
@@ -144,11 +147,13 @@ public class SuffixURLFilter implements
}
public String filter(String url) {
- if (url == null) return null;
+ if (url == null)
+ return null;
String _url;
if (ignoreCase)
_url = url.toLowerCase();
- else _url = url;
+ else
+ _url = url;
if (filterFromPath) {
try {
URL pUrl = new URL(_url);
@@ -160,11 +165,15 @@ public class SuffixURLFilter implements
String a = suffixes.shortestMatch(_url);
if (a == null) {
- if (modeAccept) return url;
- else return null;
+ if (modeAccept)
+ return url;
+ else
+ return null;
} else {
- if (modeAccept) return null;
- else return url;
+ if (modeAccept)
+ return null;
+ else
+ return url;
}
}
@@ -187,30 +196,31 @@ public class SuffixURLFilter implements
String line;
while ((line = in.readLine()) != null) {
- if (line.length() == 0) continue;
+ if (line.length() == 0)
+ continue;
char first = line.charAt(0);
switch (first) {
- case ' ':
- case '\n':
- case '#': // skip blank & comment lines
- break;
- case '-':
- allow = false;
- if(line.contains("P"))
- filterFromPath = true;
- if(line.contains("I"))
- ignore = true;
- break;
- case '+':
- allow = true;
- if(line.contains("P"))
- filterFromPath = true;
- if(line.contains("I"))
- ignore = true;
- break;
- default:
- aSuffixes.add(line);
+ case ' ':
+ case '\n':
+ case '#': // skip blank & comment lines
+ break;
+ case '-':
+ allow = false;
+ if (line.contains("P"))
+ filterFromPath = true;
+ if (line.contains("I"))
+ ignore = true;
+ break;
+ case '+':
+ allow = true;
+ if (line.contains("P"))
+ filterFromPath = true;
+ if (line.contains("I"))
+ ignore = true;
+ break;
+ default:
+ aSuffixes.add(line);
}
}
if (ignore) {
@@ -249,7 +259,8 @@ public class SuffixURLFilter implements
this.conf = conf;
String pluginName = "urlfilter-suffix";
- Extension[] extensions = PluginRepository.get(conf).getExtensionPoint(URLFilter.class.getName()).getExtensions();
+ Extension[] extensions = PluginRepository.get(conf)
+ .getExtensionPoint(URLFilter.class.getName()).getExtensions();
for (int i = 0; i < extensions.length; i++) {
Extension extension = extensions[i];
if (extension.getDescriptor().getPluginId().equals(pluginName)) {
@@ -257,22 +268,25 @@ public class SuffixURLFilter implements
break;
}
}
- if (attributeFile != null && attributeFile.trim().equals("")) attributeFile = null;
+ if (attributeFile != null && attributeFile.trim().equals(""))
+ attributeFile = null;
if (attributeFile != null) {
if (LOG.isInfoEnabled()) {
- LOG.info("Attribute \"file\" is defined for plugin " + pluginName + " as " + attributeFile);
+ LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+ + " as " + attributeFile);
}
} else {
// if (LOG.isWarnEnabled()) {
- // LOG.warn("Attribute \"file\" is not defined in plugin.xml for
- // plugin "+pluginName);
+ // LOG.warn("Attribute \"file\" is not defined in plugin.xml for
+ // plugin "+pluginName);
// }
}
String file = conf.get("urlfilter.suffix.file");
String stringRules = conf.get("urlfilter.suffix.rules");
// attribute "file" takes precedence if defined
- if (attributeFile != null) file = attributeFile;
+ if (attributeFile != null)
+ file = attributeFile;
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
@@ -283,7 +297,9 @@ public class SuffixURLFilter implements
try {
readConfiguration(reader);
} catch (IOException e) {
- if (LOG.isErrorEnabled()) { LOG.error(e.getMessage()); }
+ if (LOG.isErrorEnabled()) {
+ LOG.error(e.getMessage());
+ }
throw new RuntimeException(e.getMessage(), e);
}
}
Modified: nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java (original)
+++ nutch/trunk/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java Thu Jan 29 05:38:59 2015
@@ -25,101 +25,45 @@ import org.junit.Test;
/**
* JUnit test for <code>SuffixURLFilter</code>.
- *
+ *
* @author Andrzej Bialecki
*/
public class TestSuffixURLFilter {
- private static final String suffixes =
- "# this is a comment\n" +
- "\n" +
- ".gif\n" +
- ".jpg\n" +
- ".js\n";
-
+ private static final String suffixes = "# this is a comment\n" + "\n"
+ + ".gif\n" + ".jpg\n" + ".js\n";
+
private static final String[] urls = new String[] {
- "http://www.example.com/test.gif",
- "http://www.example.com/TEST.GIF",
- "http://www.example.com/test.jpg",
- "http://www.example.com/test.JPG",
- "http://www.example.com/test.html",
- "http://www.example.com/test.HTML",
- "http://www.example.com/test.html?q=abc.js",
- "http://www.example.com/test.js?foo=bar&baz=bar#12333",
- };
-
- private static String[] urlsModeAccept = new String[] {
- null,
- urls[1],
- null,
- urls[3],
- urls[4],
- urls[5],
- null,
- urls[7]
- };
-
- private static String[] urlsModeReject = new String[] {
- urls[0],
- null,
- urls[2],
- null,
- null,
- null,
- urls[6],
- null
- };
-
- private static String[] urlsModeAcceptIgnoreCase = new String[] {
- null,
- null,
- null,
- null,
- urls[4],
- urls[5],
- null,
- urls[7]
- };
-
- private static String[] urlsModeRejectIgnoreCase = new String[] {
- urls[0],
- urls[1],
- urls[2],
- urls[3],
- null,
- null,
- urls[6],
- null
- };
-
- private static String[] urlsModeAcceptAndPathFilter = new String[] {
- null,
- urls[1],
- null,
- urls[3],
- urls[4],
- urls[5],
- urls[6],
- null
- };
-
- private static String[] urlsModeAcceptAndNonPathFilter = new String[] {
- null,
- urls[1],
- null,
- urls[3],
- urls[4],
- urls[5],
- null,
- urls[7]
- };
-
+ "http://www.example.com/test.gif", "http://www.example.com/TEST.GIF",
+ "http://www.example.com/test.jpg", "http://www.example.com/test.JPG",
+ "http://www.example.com/test.html", "http://www.example.com/test.HTML",
+ "http://www.example.com/test.html?q=abc.js",
+ "http://www.example.com/test.js?foo=bar&baz=bar#12333", };
+
+ private static String[] urlsModeAccept = new String[] { null, urls[1], null,
+ urls[3], urls[4], urls[5], null, urls[7] };
+
+ private static String[] urlsModeReject = new String[] { urls[0], null,
+ urls[2], null, null, null, urls[6], null };
+
+ private static String[] urlsModeAcceptIgnoreCase = new String[] { null, null,
+ null, null, urls[4], urls[5], null, urls[7] };
+
+ private static String[] urlsModeRejectIgnoreCase = new String[] { urls[0],
+ urls[1], urls[2], urls[3], null, null, urls[6], null };
+
+ private static String[] urlsModeAcceptAndPathFilter = new String[] { null,
+ urls[1], null, urls[3], urls[4], urls[5], urls[6], null };
+
+ private static String[] urlsModeAcceptAndNonPathFilter = new String[] { null,
+ urls[1], null, urls[3], urls[4], urls[5], null, urls[7] };
+
private SuffixURLFilter filter = null;
-
+
@Before
public void setUp() throws IOException {
filter = new SuffixURLFilter(new StringReader(suffixes));
}
-
+
@Test
public void testModeAccept() {
filter.setIgnoreCase(false);
@@ -155,22 +99,24 @@ public class TestSuffixURLFilter {
Assert.assertTrue(urlsModeRejectIgnoreCase[i] == filter.filter(urls[i]));
}
}
-
+
@Test
public void testModeAcceptAndNonPathFilter() {
filter.setModeAccept(true);
filter.setFilterFromPath(false);
for (int i = 0; i < urls.length; i++) {
- Assert.assertTrue(urlsModeAcceptAndNonPathFilter[i] == filter.filter(urls[i]));
+ Assert.assertTrue(urlsModeAcceptAndNonPathFilter[i] == filter
+ .filter(urls[i]));
}
}
-
+
@Test
public void testModeAcceptAndPathFilter() {
filter.setModeAccept(true);
filter.setFilterFromPath(true);
for (int i = 0; i < urls.length; i++) {
- Assert.assertTrue(urlsModeAcceptAndPathFilter[i] == filter.filter(urls[i]));
+ Assert.assertTrue(urlsModeAcceptAndPathFilter[i] == filter
+ .filter(urls[i]));
}
}
Modified: nutch/trunk/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java (original)
+++ nutch/trunk/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java Thu Jan 29 05:38:59 2015
@@ -23,12 +23,16 @@ import org.apache.hadoop.conf.Configurat
import org.apache.nutch.net.URLFilter;
/**
- * <p>Validates URLs.</p>
- *
- * <p>Originally based in on php script by Debbie Dyer, validation.php v1.2b,
- * Date: 03/07/02,
- * http://javascript.internet.com. However, this validation now bears little
- * resemblance to the php original.</p>
+ * <p>
+ * Validates URLs.
+ * </p>
+ *
+ * <p>
+ * Originally based on php script by Debbie Dyer, validation.php v1.2b, Date:
+ * 03/07/02, http://javascript.internet.com. However, this validation now bears
+ * little resemblance to the php original.
+ * </p>
+ *
* <pre>
* Example of usage:
* UrlValidator urlValidator = UrlValidator.get();
@@ -37,17 +41,17 @@ import org.apache.nutch.net.URLFilter;
* } else {
* System.out.println("url is invalid");
* }
- *
+ *
* prints out "url is valid"
- * </pre>
- *
- * <p>Based on UrlValidator code from Apache commons-validator.</p>
- *
- * @see
- * <a href='http://www.ietf.org/rfc/rfc2396.txt' >
- * Uniform Resource Identifiers (URI): Generic Syntax
- * </a>
- *
+ * </pre>
+ *
+ * <p>
+ * Based on UrlValidator code from Apache commons-validator.
+ * </p>
+ *
+ * @see <a href='http://www.ietf.org/rfc/rfc2396.txt' > Uniform Resource
+ * Identifiers (URI): Generic Syntax </a>
+ *
*/
public class UrlValidator implements URLFilter {
@@ -61,7 +65,7 @@ public class UrlValidator implements URL
private static final String SCHEME_CHARS = ALPHA_CHARS;
- // Drop numeric, and "+-." for now
+ // Drop numeric, and "+-." for now
private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\.";
private static final String ATOM = VALID_CHARS + '+';
@@ -69,9 +73,9 @@ public class UrlValidator implements URL
/**
* This expression derived/taken from the BNF for URI (RFC2396).
*/
- private static final Pattern URL_PATTERN =
- Pattern.compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)" +
- "(\\?([^#]*))?(#(.*))?");
+ private static final Pattern URL_PATTERN = Pattern
+ .compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)"
+ + "(\\?([^#]*))?(#(.*))?");
/**
* Schema/Protocol (ie. http:, ftp:, file:, etc).
@@ -90,11 +94,11 @@ public class UrlValidator implements URL
/**
* Protocol (ie. http:, ftp:,https:).
*/
- private static final Pattern SCHEME_PATTERN =
- Pattern.compile("^[" + SCHEME_CHARS + "]+");
+ private static final Pattern SCHEME_PATTERN = Pattern.compile("^["
+ + SCHEME_CHARS + "]+");
- private static final Pattern AUTHORITY_PATTERN =
- Pattern.compile("^([" + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?");
+ private static final Pattern AUTHORITY_PATTERN = Pattern.compile("^(["
+ + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?");
private static final int PARSE_AUTHORITY_HOST_IP = 1;
@@ -105,28 +109,26 @@ public class UrlValidator implements URL
*/
private static final int PARSE_AUTHORITY_EXTRA = 3;
- private static final Pattern PATH_PATTERN =
- Pattern.compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$");
+ private static final Pattern PATH_PATTERN = Pattern
+ .compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$");
private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$");
- private static final Pattern LEGAL_ASCII_PATTERN =
- Pattern.compile("^[\\x21-\\x7E]+$");
+ private static final Pattern LEGAL_ASCII_PATTERN = Pattern
+ .compile("^[\\x21-\\x7E]+$");
- private static final Pattern IP_V4_DOMAIN_PATTERN =
- Pattern.compile("^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$");
+ private static final Pattern IP_V4_DOMAIN_PATTERN = Pattern
+ .compile("^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$");
- private static final Pattern DOMAIN_PATTERN =
- Pattern.compile("^" + ATOM + "(\\." + ATOM + ")*$");
+ private static final Pattern DOMAIN_PATTERN = Pattern.compile("^" + ATOM
+ + "(\\." + ATOM + ")*$");
- private static final Pattern PORT_PATTERN =
- Pattern.compile("^:(\\d{1,5})$");
+ private static final Pattern PORT_PATTERN = Pattern.compile("^:(\\d{1,5})$");
- private static final Pattern ATOM_PATTERN =
- Pattern.compile("(" + ATOM + ")");
+ private static final Pattern ATOM_PATTERN = Pattern.compile("(" + ATOM + ")");
- private static final Pattern ALPHA_PATTERN =
- Pattern.compile("^[" + ALPHA_CHARS + "]");
+ private static final Pattern ALPHA_PATTERN = Pattern.compile("^["
+ + ALPHA_CHARS + "]");
private Configuration conf;
@@ -143,10 +145,13 @@ public class UrlValidator implements URL
}
/**
- * <p>Checks if a field has a valid url address.</p>
- *
- * @param value The value validation is being performed on.
- * A <code>null</code> value is considered invalid.
+ * <p>
+ * Checks if a field has a valid url address.
+ * </p>
+ *
+ * @param value
+ * The value validation is being performed on. A <code>null</code>
+ * value is considered invalid.
* @return true if the url is valid.
*/
private boolean isValid(String value) {
@@ -184,11 +189,13 @@ public class UrlValidator implements URL
}
/**
- * Validate scheme. If schemes[] was initialized to a non null,
- * then only those scheme's are allowed. Note this is slightly different
- * than for the constructor.
- * @param scheme The scheme to validate. A <code>null</code> value is
- * considered invalid.
+ * Validate scheme. If schemes[] was initialized to a non null, then only
+ * those schemes are allowed. Note this is slightly different than for the
+ * constructor.
+ *
+ * @param scheme
+ * The scheme to validate. A <code>null</code> value is considered
+ * invalid.
* @return true if valid.
*/
private boolean isValidScheme(String scheme) {
@@ -200,10 +207,12 @@ public class UrlValidator implements URL
}
/**
- * Returns true if the authority is properly formatted. An authority is
- * the combination of hostname and port. A <code>null</code> authority
- * value is considered invalid.
- * @param authority Authority value to validate.
+ * Returns true if the authority is properly formatted. An authority is the
+ * combination of hostname and port. A <code>null</code> authority value is
+ * considered invalid.
+ *
+ * @param authority
+ * Authority value to validate.
* @return true if authority (hostname and port) is valid.
*/
private boolean isValidAuthority(String authority) {
@@ -235,7 +244,7 @@ public class UrlValidator implements URL
if (Integer.parseInt(ipSegment) > 255) {
return false;
}
- } catch(NumberFormatException e) {
+ } catch (NumberFormatException e) {
return false;
}
@@ -251,8 +260,8 @@ public class UrlValidator implements URL
// TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203
char[] chars = hostIP.toCharArray();
int size = 1;
- for(int i=0; i<chars.length; i++) {
- if(chars[i] == '.') {
+ for (int i = 0; i < chars.length; i++) {
+ if (chars[i] == '.') {
size++;
}
}
@@ -264,8 +273,7 @@ public class UrlValidator implements URL
while (atomMatcher.find()) {
domainSegment[segCount] = atomMatcher.group();
segLen = domainSegment[segCount].length() + 1;
- hostIP = (segLen >= hostIP.length()) ? ""
- : hostIP.substring(segLen);
+ hostIP = (segLen >= hostIP.length()) ? "" : hostIP.substring(segLen);
segCount++;
}
String topLevel = domainSegment[segCount - 1];
@@ -300,10 +308,13 @@ public class UrlValidator implements URL
}
/**
- * <p>Checks if the field isn't null and length of the field is greater
- * than zero not including whitespace.</p>
- *
- * @param value The value validation is being performed on.
+ * <p>
+ * Checks if the field isn't null and length of the field is greater than zero
+ * not including whitespace.
+ * </p>
+ *
+ * @param value
+ * The value validation is being performed on.
* @return true if blank or null.
*/
private boolean isBlankOrNull(String value) {
@@ -311,9 +322,11 @@ public class UrlValidator implements URL
}
/**
- * Returns true if the path is valid. A <code>null</code> value is
- * considered invalid.
- * @param path Path value to validate.
+ * Returns true if the path is valid. A <code>null</code> value is considered
+ * invalid.
+ *
+ * @param path
+ * Path value to validate.
* @return true if path is valid.
*/
private boolean isValidPath(String path) {
@@ -335,7 +348,9 @@ public class UrlValidator implements URL
/**
* Returns true if the query is null or it's a properly formatted query
* string.
- * @param query Query value to validate.
+ *
+ * @param query
+ * Query value to validate.
* @return true if query is valid.
*/
private boolean isValidQuery(String query) {
@@ -348,8 +363,11 @@ public class UrlValidator implements URL
/**
* Returns the number of times the token appears in the target.
- * @param token Token value to be counted.
- * @param target Target value to count tokens in.
+ *
+ * @param token
+ * Token value to be counted.
+ * @param target
+ * Target value to count tokens in.
* @return the number of tokens.
*/
private int countToken(String token, String target) {
Modified: nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java (original)
+++ nutch/trunk/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java Thu Jan 29 05:38:59 2015
@@ -21,40 +21,59 @@ import org.junit.Assert;
import org.junit.Test;
/**
- * JUnit test case which tests
- * 1. that valid urls are not filtered while invalid ones are filtered.
- * 2. that Urls' scheme, authority, path and query are validated.
+ * JUnit test case which tests 1. that valid urls are not filtered while invalid
+ * ones are filtered. 2. that Urls' scheme, authority, path and query are
+ * validated.
*
* @author tejasp
- *
+ *
*/
public class TestUrlValidator {
/**
- * Test method for {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)}.
+ * Test method for
+ * {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)}
+ * .
*/
@Test
public void testFilter() {
UrlValidator url_validator = new UrlValidator();
Assert.assertNotNull(url_validator);
- Assert.assertNull("Filtering on a null object should return null", url_validator.filter(null));
- Assert.assertNull("Invalid url: example.com/file[/].html", url_validator.filter("example.com/file[/].html"));
- Assert.assertNull("Invalid url: http://www.example.com/space here.html", url_validator.filter("http://www.example.com/space here.html"));
- Assert.assertNull("Invalid url: /main.html", url_validator.filter("/main.html"));
- Assert.assertNull("Invalid url: www.example.com/main.html", url_validator.filter("www.example.com/main.html"));
- Assert.assertNull("Invalid url: ftp:www.example.com/main.html", url_validator.filter("ftp:www.example.com/main.html"));
- Assert.assertNull("Inalid url: http://999.000.456.32/nutch/trunk/README.txt",
+ Assert.assertNull("Filtering on a null object should return null",
+ url_validator.filter(null));
+ Assert.assertNull("Invalid url: example.com/file[/].html",
+ url_validator.filter("example.com/file[/].html"));
+ Assert.assertNull("Invalid url: http://www.example.com/space here.html",
+ url_validator.filter("http://www.example.com/space here.html"));
+ Assert.assertNull("Invalid url: /main.html",
+ url_validator.filter("/main.html"));
+ Assert.assertNull("Invalid url: www.example.com/main.html",
+ url_validator.filter("www.example.com/main.html"));
+ Assert.assertNull("Invalid url: ftp:www.example.com/main.html",
+ url_validator.filter("ftp:www.example.com/main.html"));
+ Assert.assertNull(
+ "Inalid url: http://999.000.456.32/nutch/trunk/README.txt",
url_validator.filter("http://999.000.456.32/nutch/trunk/README.txt"));
- Assert.assertNull("Invalid url: http://www.example.com/ma|in\\toc.html", url_validator.filter(" http://www.example.com/ma|in\\toc.html"));
+ Assert.assertNull("Invalid url: http://www.example.com/ma|in\\toc.html",
+ url_validator.filter(" http://www.example.com/ma|in\\toc.html"));
- Assert.assertNotNull("Valid url: https://issues.apache.org/jira/NUTCH-1127", url_validator.filter("https://issues.apache.org/jira/NUTCH-1127"));
- Assert.assertNotNull("Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather",
- url_validator.filter("http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather"));
- Assert.assertNotNull("Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress",
- url_validator.filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress"));
- Assert.assertNotNull("Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf", url_validator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf"));
+ Assert.assertNotNull(
+ "Valid url: https://issues.apache.org/jira/NUTCH-1127",
+ url_validator.filter("https://issues.apache.org/jira/NUTCH-1127"));
+ Assert
+ .assertNotNull(
+ "Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather",
+ url_validator
+ .filter("http://domain.tld/function.cgi?url=http://fonzi.com/&name=Fonzi&mood=happy&coat=leather"));
+ Assert
+ .assertNotNull(
+ "Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress",
+ url_validator
+ .filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress"));
+ Assert.assertNotNull("Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf",
+ url_validator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf"));
}
}