You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2016/01/26 20:19:03 UTC
svn commit: r1726853 [1/2] - in /nutch/branches/2.x: ./ conf/ src/gora/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/
src/java/org/apache/nutch/parse/ src/java/org...
Author: lewismc
Date: Tue Jan 26 19:19:02 2016
New Revision: 1726853
URL: http://svn.apache.org/viewvc?rev=1726853&view=rev
Log:
NUTCH-1741 Support of Sitemaps in Nutch 2.x
Added:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectType.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParse.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParser.java
nutch/branches/2.x/src/test/org/apache/nutch/parse/TestSitemapParser.java
nutch/branches/2.x/src/test/org/apache/nutch/util/HelloHandler.java
nutch/branches/2.x/src/testresources/fetch-test-site/sitemap1.xml
nutch/branches/2.x/src/testresources/fetch-test-site/sitemap2.xml
nutch/branches/2.x/src/testresources/fetch-test-site/sitemapIndex.xml
Modified:
nutch/branches/2.x/.gitignore
nutch/branches/2.x/conf/gora-accumulo-mapping.xml
nutch/branches/2.x/conf/gora-cassandra-mapping.xml
nutch/branches/2.x/conf/gora-hbase-mapping.xml
nutch/branches/2.x/conf/gora-mongodb-mapping.xml
nutch/branches/2.x/conf/gora-solr-mapping.xml
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/src/gora/webpage.avsc
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java
nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java
nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/Mark.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java
nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java
nutch/branches/2.x/src/testresources/fetch-test-site/robots.txt
Modified: nutch/branches/2.x/.gitignore
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/.gitignore?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/.gitignore (original)
+++ nutch/branches/2.x/.gitignore Tue Jan 26 19:19:02 2016
@@ -5,3 +5,7 @@ conf/slaves
build/
runtime/
logs/
+*.iml
+.idea
+.log
+
Modified: nutch/branches/2.x/conf/gora-accumulo-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-accumulo-mapping.xml?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/conf/gora-accumulo-mapping.xml (original)
+++ nutch/branches/2.x/conf/gora-accumulo-mapping.xml Tue Jan 26 19:19:02 2016
@@ -32,6 +32,7 @@
<family name="s" />
<family name="il" />
<family name="ol" />
+ <family name="stm" />
<family name="h" />
<family name="mtdt" />
<family name="mk" />
@@ -53,6 +54,7 @@
<field name="modifiedTime" family="f" qualifier="mod"/>
<field name="prevModifiedTime" family="f" qualifier="pmod"/>
<field name="batchId" family="f" qualifier="bid"/>
+ <field name="sitemaps" family="stm"/>
<!-- parse fields -->
<field name="title" family="p" qualifier="t"/>
@@ -63,11 +65,12 @@
<!-- score fields -->
<field name="score" family="s" qualifier="s"/>
- <field name="headers" family="h"/>
- <field name="inlinks" family="il"/>
- <field name="outlinks" family="ol"/>
- <field name="metadata" family="mtdt"/>
- <field name="markers" family="mk"/>
+ <field name="stmPriority" family="s" qualifier="sp"/>
+ <field name="headers" family="h" qualifier="hea"/>
+ <field name="inlinks" family="il" qualifier="inl"/>
+ <field name="outlinks" family="ol" qualifier="out"/>
+ <field name="metadata" family="mtdt" qualifier="met"/>
+ <field name="markers" family="mk" qualifier="mar"/>
</class>
<table name="host">
@@ -77,9 +80,9 @@
</table>
<class table="host" keyClass="java.lang.String" name="org.apache.nutch.storage.Host">
- <field name="metadata" family="mtdt"/>
- <field name="inlinks" family="il"/>
- <field name="outlinks" family="ol"/>
+ <field name="metadata" family="mtdt" qualifier="met"/>
+ <field name="inlinks" family="il" qualifier="inl"/>
+ <field name="outlinks" family="ol" qualifier="out"/>
</class>
</gora-orm>
Modified: nutch/branches/2.x/conf/gora-cassandra-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-cassandra-mapping.xml?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/conf/gora-cassandra-mapping.xml (original)
+++ nutch/branches/2.x/conf/gora-cassandra-mapping.xml Tue Jan 26 19:19:02 2016
@@ -58,6 +58,7 @@
<field name="modifiedTime" family="f" qualifier="mod" ttl="0"/>
<field name="prevModifiedTime" family="f" qualifier="pmod" ttl="0"/>
<field name="batchId" family="f" qualifier="bid" ttl="0"/>
+ <field name="sitemaps" family="f" qualifier="stm" ttl="0"/>
<!-- parse fields -->
<field name="title" family="p" qualifier="t" ttl="0"/>
@@ -67,6 +68,7 @@
<!-- score fields -->
<field name="score" family="f" qualifier="s" ttl="0"/>
+ <field name="stmPriority" family="f" qualifier="sp" ttl="0"/>
<!-- super columns -->
<field name="headers" family="sc" qualifier="h" ttl="0"/>
Modified: nutch/branches/2.x/conf/gora-hbase-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-hbase-mapping.xml?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/conf/gora-hbase-mapping.xml (original)
+++ nutch/branches/2.x/conf/gora-hbase-mapping.xml Tue Jan 26 19:19:02 2016
@@ -46,6 +46,7 @@ http://gora.apache.org/current/gora-hbas
<family name="s" maxVersions="1"/>
<family name="il" maxVersions="1"/>
<family name="ol" maxVersions="1"/>
+ <family name="stm" maxVersions="1"/>
<family name="h" maxVersions="1"/>
<family name="mtdt" maxVersions="1"/>
<family name="mk" maxVersions="1"/>
@@ -66,6 +67,8 @@ http://gora.apache.org/current/gora-hbas
<field name="modifiedTime" family="f" qualifier="mod"/>
<field name="prevModifiedTime" family="f" qualifier="pmod"/>
<field name="batchId" family="f" qualifier="bid"/>
+ <field name="sitemaps" family="stm"/>
+
<!-- parse fields -->
<field name="title" family="p" qualifier="t"/>
@@ -76,6 +79,8 @@ http://gora.apache.org/current/gora-hbas
<!-- score fields -->
<field name="score" family="s" qualifier="s"/>
+ <field name="stmPriority" family="s" qualifier="sp"/>
+
<field name="headers" family="h"/>
<field name="inlinks" family="il"/>
<field name="outlinks" family="ol"/>
Modified: nutch/branches/2.x/conf/gora-mongodb-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-mongodb-mapping.xml?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/conf/gora-mongodb-mapping.xml (original)
+++ nutch/branches/2.x/conf/gora-mongodb-mapping.xml Tue Jan 26 19:19:02 2016
@@ -35,6 +35,7 @@
<field name="modifiedTime" docfield="modifiedTime" type="int64"/>
<field name="prevModifiedTime" docfield="prevModifiedTime" type="int64"/>
<field name="batchId" docfield="batchId" type="string"/>
+ <field name="sitemaps" docfield="sitemaps" type="document"/>
<!-- parse fields -->
<field name="title" docfield="title" type="string"/>
@@ -43,6 +44,7 @@
<field name="prevSignature" docfield="prevSignature" type="string"/>
<!-- score fields -->
<field name="score" docfield="score" type="int32"/>
+ <field name="stmPriority" family="stmPriority" type="int32"/>
<field name="headers" docfield="headers" type="document"/>
<field name="inlinks" docfield="inlinks" type="document"/>
<field name="outlinks" docfield="outlinks" type="document"/>
Modified: nutch/branches/2.x/conf/gora-solr-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-solr-mapping.xml?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/conf/gora-solr-mapping.xml (original)
+++ nutch/branches/2.x/conf/gora-solr-mapping.xml Tue Jan 26 19:19:02 2016
@@ -35,6 +35,7 @@
<field name="modifiedTime" column="modifiedTime"/>
<field name="prevModifiedTime" column="prevModifiedTime" />
<field name="batchId" column="batchId" />
+ <field name="sitemaps" family="sitemaps"/>
<!-- parse fields -->
<field name="title" column="title" />
@@ -43,6 +44,7 @@
<field name="prevSignature" column="prevSignature"/>
<!-- score fields -->
<field name="score" column="score"/>
+ <field name="stmPriority" column="stmPriority"/>
<field name="headers" column="headers"/>
<field name="inlinks" column="inlinks" />
<field name="outlinks" column="outlinks"/>
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Tue Jan 26 19:19:02 2016
@@ -216,6 +216,16 @@
</property>
<property>
+ <name>sitemap.content.limit</name>
+ <value>-1</value>
+ <description>The length limit for downloaded content using the http
+ protocol for sitemap, in bytes. If this value is nonnegative (>=0),
+ content longer than it will be truncated; otherwise, no truncation at all. Do not
+ confuse this setting with the file.content.limit setting.
+ </description>
+</property>
+
+<property>
<name>http.proxy.host</name>
<value></value>
<description>The proxy hostname. If empty, no proxy is used.</description>
@@ -1008,6 +1018,17 @@
Set to -1 to deactivate, bearing in mind that this could cause
the parsing to crash because of a very long or corrupted document.
</description>
+</property>
+
+<property>
+ <name>sitemap.parser.timeout</name>
+ <value>30</value>
+ <description>Timeout in seconds for the parsing of a document, otherwise
+ treats it as an exception and moves on the the following documents.
+ This parameter is applied to Sitemap Parser implementation.
+ Set to -1 to deactivate, bearing in mind that this could cause
+ the parsing to crash because of a very long or corrupted document.
+ </description>
</property>
<property>
Modified: nutch/branches/2.x/src/gora/webpage.avsc
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/gora/webpage.avsc?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/gora/webpage.avsc (original)
+++ nutch/branches/2.x/src/gora/webpage.avsc Tue Jan 26 19:19:02 2016
@@ -278,6 +278,26 @@
],
"doc": "A batchId that this WebPage is assigned to. WebPage's are fetched in batches, called fetchlists. Pages are partitioned but can always be associated and fetched alongside pages of similar value (within a crawl cycle) based on batchId.",
"default": null
+ },
+ {
+ "name": "sitemaps",
+ "type": {
+ "type": "map",
+ "values": [
+ "null",
+ "string"
+ ]
+ },
+ "doc": "Sitemap urls in robot.txt",
+ "default": {
+
+ },
+ {
+ "name": "stmPriority",
+ "type": "float",
+ "doc": "",
+ "default": 0
+ },
}
]
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java Tue Jan 26 19:19:02 2016
@@ -36,7 +36,7 @@ import org.apache.nutch.util.WebPageWrit
import org.apache.gora.mapreduce.GoraMapper;
public class DbUpdateMapper extends
- GoraMapper<String, WebPage, UrlWithScore, NutchWritable> {
+GoraMapper<String, WebPage, UrlWithScore, NutchWritable> {
public static final Logger LOG = DbUpdaterJob.LOG;
private ScoringFilters scoringFilters;
@@ -57,7 +57,7 @@ public class DbUpdateMapper extends
if (Mark.GENERATE_MARK.checkMark(page) == null) {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping " + TableUtil.unreverseUrl(key)
- + "; not generated yet");
+ + "; not generated yet");
}
return;
}
@@ -66,16 +66,10 @@ public class DbUpdateMapper extends
scoreData.clear();
Map<CharSequence, CharSequence> outlinks = page.getOutlinks();
- if (outlinks != null) {
- for (Entry<CharSequence, CharSequence> e : outlinks.entrySet()) {
- int depth = Integer.MAX_VALUE;
- CharSequence depthUtf8 = page.getMarkers().get(DbUpdaterJob.DISTANCE);
- if (depthUtf8 != null)
- depth = Integer.parseInt(depthUtf8.toString());
- scoreData.add(new ScoreDatum(0.0f, e.getKey().toString(), e.getValue()
- .toString(), depth));
- }
- }
+ addScoreData(page, outlinks);
+
+ Map<CharSequence, CharSequence> sitemaps = page.getSitemaps();
+ addScoreData(page, sitemaps);
// TODO: Outlink filtering (i.e. "only keep the first n outlinks")
try {
@@ -102,6 +96,19 @@ public class DbUpdateMapper extends
}
}
+ private void addScoreData(WebPage page, Map<CharSequence, CharSequence> map) {
+ if (map != null) {
+ for (Entry<CharSequence, CharSequence> e : map.entrySet()) {
+ int depth = Integer.MAX_VALUE;
+ CharSequence depthUtf8 = page.getMarkers().get(DbUpdaterJob.DISTANCE);
+ if (depthUtf8 != null)
+ depth = Integer.parseInt(depthUtf8.toString());
+ scoreData.add(new ScoreDatum(0.0f, e.getKey().toString(), e.getValue()
+ .toString(), depth));
+ }
+ }
+ }
+
@Override
public void setup(Context context) {
scoringFilters = new ScoringFilters(context.getConfiguration());
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java Tue Jan 26 19:19:02 2016
@@ -64,6 +64,7 @@ public class DbUpdaterJob extends NutchT
FIELDS.add(WebPage.Field.PREV_FETCH_TIME);
FIELDS.add(WebPage.Field.PREV_MODIFIED_TIME);
FIELDS.add(WebPage.Field.HEADERS);
+ FIELDS.add(WebPage.Field.SITEMAPS);
}
public static final Utf8 DISTANCE = new Utf8("dist");
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Tue Jan 26 19:19:02 2016
@@ -51,6 +51,7 @@ public class GeneratorJob extends NutchT
public static final String GENERATOR_MIN_SCORE = "generate.min.score";
public static final String GENERATOR_FILTER = "generate.filter";
public static final String GENERATOR_NORMALISE = "generate.normalise";
+ public static final String GENERATOR_SITEMAP = "generate.sitemap";
public static final String GENERATOR_MAX_COUNT = "generate.max.count";
public static final String GENERATOR_COUNT_MODE = "generate.count.mode";
public static final String GENERATOR_COUNT_VALUE_DOMAIN = "domain";
@@ -75,7 +76,7 @@ public class GeneratorJob extends NutchT
public static final Logger LOG = LoggerFactory.getLogger(GeneratorJob.class);
public static class SelectorEntry implements
- WritableComparable<SelectorEntry> {
+ WritableComparable<SelectorEntry> {
String url;
float score;
@@ -170,7 +171,7 @@ public class GeneratorJob extends NutchT
String batchId = (curTime / 1000) + "-" + randomSeed;
return batchId;
}
-
+
public Map<String, Object> run(Map<String, Object> args) throws Exception {
String batchId = (String) args.get(Nutch.ARG_BATCH);
if (batchId == null) {
@@ -191,12 +192,16 @@ public class GeneratorJob extends NutchT
}
Boolean filter = (Boolean) args.get(Nutch.ARG_FILTER);
Boolean norm = (Boolean) args.get(Nutch.ARG_NORMALIZE);
+ Boolean sitemap = (Boolean) args.get(Nutch.ARG_SITEMAP);
+
// map to inverted subset due for fetch, sort by score
getConf().setLong(GENERATOR_CUR_TIME, curTime);
if (topN != null)
getConf().setLong(GENERATOR_TOP_N, topN);
if (filter != null)
getConf().setBoolean(GENERATOR_FILTER, filter);
+ if (sitemap != null)
+ getConf().setBoolean(GENERATOR_SITEMAP, sitemap);
getConf().setLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis());
if (norm != null)
@@ -239,23 +244,26 @@ public class GeneratorJob extends NutchT
* @throws ClassNotFoundException
* @throws InterruptedException
* */
- public String generate(long topN, long curTime, boolean filter, boolean norm)
- throws Exception {
+ public String generate(long topN, long curTime, boolean filter, boolean norm,
+ boolean sitemap) throws Exception {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
- LOG.info("GeneratorJob: starting at " + sdf.format(start));
+ LOG.info("GeneratorJob: starting at {}", sdf.format(start));
LOG.info("GeneratorJob: Selecting best-scoring urls due for fetch.");
LOG.info("GeneratorJob: starting");
- LOG.info("GeneratorJob: filtering: " + filter);
- LOG.info("GeneratorJob: normalizing: " + norm);
+ LOG.info("GeneratorJob: filtering: {}", filter);
+ LOG.info("GeneratorJob: normalizing: {}", norm);
+ if (sitemap) {
+ LOG.info("GeneratorJob: sitemap: {}", sitemap);
+ }
if (topN != Long.MAX_VALUE) {
- LOG.info("GeneratorJob: topN: " + topN);
+ LOG.info("GeneratorJob: topN: {}", topN);
}
String batchId = getConf().get(BATCH_ID);
Map<String, Object> results = run(ToolUtil.toArgMap(Nutch.ARG_TOPN, topN,
Nutch.ARG_CURTIME, curTime, Nutch.ARG_FILTER, filter,
- Nutch.ARG_NORMALIZE, norm, Nutch.ARG_BATCH, batchId));
+ Nutch.ARG_NORMALIZE, norm, Nutch.ARG_BATCH, batchId, Nutch.ARG_SITEMAP, sitemap));
if (batchId == null) {
// use generated random batch id
batchId = (String) results.get(BATCH_ID);
@@ -263,10 +271,10 @@ public class GeneratorJob extends NutchT
long finish = System.currentTimeMillis();
long generateCount = (Long) results.get(GENERATE_COUNT);
- LOG.info("GeneratorJob: finished at " + sdf.format(finish)
- + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
- LOG.info("GeneratorJob: generated batch id: " + batchId + " containing "
- + generateCount + " URLs");
+ LOG.info("GeneratorJob: finished at {}, time elapsed: {}",
+ sdf.format(finish), TimingUtil.elapsedTime(start, finish));
+ LOG.info("GeneratorJob: generated batch id: {} containing {} URLs",
+ batchId, generateCount);
if (generateCount == 0) {
return null;
}
@@ -276,19 +284,21 @@ public class GeneratorJob extends NutchT
public int run(String[] args) throws Exception {
if (args.length <= 0) {
System.out
- .println("Usage: GeneratorJob [-topN N] [-crawlId id] [-noFilter] [-noNorm] [-adddays numDays]");
+ .println("Usage: GeneratorJob [-topN N] [-crawlId id] [-noFilter] [-noNorm] [-adddays numDays] [-sitemap]");
+ System.out
+ .println(" -topN <N> - number of top URLs to be selected, default is Long.MAX_VALUE ");
System.out
- .println(" -topN <N> - number of top URLs to be selected, default is Long.MAX_VALUE ");
+ .println(" -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)\");");
System.out
- .println(" -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)\");");
+ .println(" -noFilter - do not activate the filter plugin to filter the url, default is true ");
System.out
- .println(" -noFilter - do not activate the filter plugin to filter the url, default is true ");
+ .println(" -noNorm - do not activate the normalizer plugin to normalize the url, default is true ");
System.out
- .println(" -noNorm - do not activate the normalizer plugin to normalize the url, default is true ");
+ .println(" -adddays - Adds numDays to the current time to facilitate crawling urls already");
System.out
- .println(" -adddays - Adds numDays to the current time to facilitate crawling urls already");
+ .println(" -sitemap - generate only sitemap url, default false");
System.out
- .println(" fetched sooner then db.fetch.interval.default. Default value is 0.");
+ .println(" fetched sooner then db.fetch.interval.default. Default value is 0.");
System.out.println(" -batchId - the batch id ");
System.out.println("----------------------");
System.out.println("Please set the params.");
@@ -297,6 +307,7 @@ public class GeneratorJob extends NutchT
long curTime = System.currentTimeMillis(), topN = Long.MAX_VALUE;
boolean filter = true, norm = true;
+ boolean sitemap = false;
for (int i = 0; i < args.length; i++) {
if ("-topN".equals(args[i])) {
@@ -307,6 +318,8 @@ public class GeneratorJob extends NutchT
norm = false;
} else if ("-crawlId".equals(args[i])) {
getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
+ } else if ("-sitemap".equals(args[i])) {
+ sitemap = true;
} else if ("-adddays".equals(args[i])) {
long numDays = Integer.parseInt(args[++i]);
curTime += numDays * 1000L * 60 * 60 * 24;
@@ -319,7 +332,7 @@ public class GeneratorJob extends NutchT
}
try {
- return (generate(topN, curTime, filter, norm) != null) ? 0 : 1;
+ return (generate(topN, curTime, filter, norm, sitemap) != null) ? 0 : 1;
} catch (Exception e) {
LOG.error("GeneratorJob: " + StringUtils.stringifyException(e));
return -1;
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java Tue Jan 26 19:19:02 2016
@@ -32,12 +32,13 @@ import java.io.IOException;
import java.net.MalformedURLException;
public class GeneratorMapper extends
- GoraMapper<String, WebPage, SelectorEntry, WebPage> {
+GoraMapper<String, WebPage, SelectorEntry, WebPage> {
private URLFilters filters;
private URLNormalizers normalizers;
private boolean filter;
private boolean normalise;
+ private boolean sitemap;
private FetchSchedule schedule;
private ScoringFilters scoringFilters;
private long curTime;
@@ -73,13 +74,16 @@ public class GeneratorMapper extends
}
if (filter && filters.filter(url) == null)
return;
+ if ((sitemap && !URLFilters.isSitemap(page)) || !sitemap && URLFilters
+ .isSitemap(page))
+ return;
} catch (URLFilterException e) {
GeneratorJob.LOG
- .warn("Couldn't filter url: {} ({})", url, e.getMessage());
+ .warn("Couldn't filter url: {} ({})", url, e.getMessage());
return;
} catch (MalformedURLException e) {
GeneratorJob.LOG
- .warn("Couldn't filter url: {} ({})", url, e.getMessage());
+ .warn("Couldn't filter url: {} ({})", url, e.getMessage());
return;
}
@@ -106,6 +110,7 @@ public class GeneratorMapper extends
Configuration conf = context.getConfiguration();
filter = conf.getBoolean(GeneratorJob.GENERATOR_FILTER, true);
normalise = conf.getBoolean(GeneratorJob.GENERATOR_NORMALISE, true);
+ sitemap = conf.getBoolean(GeneratorJob.GENERATOR_SITEMAP, false);
if (filter) {
filters = new URLFilters(conf);
}
Added: nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectType.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectType.java?rev=1726853&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectType.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectType.java Tue Jan 26 19:19:02 2016
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import org.apache.avro.util.Utf8;
+
+public enum InjectType {
+ INJECT("y"),
+ SITEMAP_INJECT("s");
+
+ Utf8 type;
+
+ private InjectType(String type) {
+ this.type = new Utf8(type);
+ }
+
+ public Utf8 getTypeString() {
+ return new Utf8(type);
+ }
+
+}
\ No newline at end of file
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java Tue Jan 26 19:19:02 2016
@@ -65,8 +65,6 @@ public class InjectorJob extends NutchTo
private static final Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
- private static final Utf8 YES_STRING = new Utf8("y");
-
static {
FIELDS.add(WebPage.Field.MARKERS);
FIELDS.add(WebPage.Field.STATUS);
@@ -79,8 +77,7 @@ public class InjectorJob extends NutchTo
*/
public static String nutchFetchIntervalMDName = "nutch.fetchInterval";
- public static class UrlMapper extends
- Mapper<LongWritable, Text, String, WebPage> {
+ public static class UrlMapper extends Mapper<LongWritable, Text, String, WebPage> {
private URLNormalizers urlNormalizers;
private int interval;
private float scoreInjected;
@@ -90,7 +87,7 @@ public class InjectorJob extends NutchTo
@Override
protected void setup(Context context) throws IOException,
- InterruptedException {
+ InterruptedException {
urlNormalizers = new URLNormalizers(context.getConfiguration(),
URLNormalizers.SCOPE_INJECT);
interval = context.getConfiguration().getInt("db.fetch.interval.default",
@@ -117,13 +114,26 @@ public class InjectorJob extends NutchTo
float customScore = -1f;
int customInterval = interval;
Map<String, String> metadata = new TreeMap<String, String>();
+ InjectType injectType = InjectType.INJECT;
if (url.indexOf("\t") != -1) {
String[] splits = url.split("\t");
url = splits[0];
for (int s = 1; s < splits.length; s++) {
// find separation between name and value
int indexEquals = splits[s].indexOf("=");
- if (indexEquals == -1) {
+ if (splits[s].indexOf("sitemaps:") > -1) {
+ String[] sitemaps = splits[s].trim().split(" ");
+ String sitemapUrl;
+ for (int i = 1; i < sitemaps.length; i++) {
+ sitemapUrl = url + sitemaps[i];
+ write(sitemapUrl, context, customInterval, customScore,
+ new HashMap<String, String>(), InjectType.SITEMAP_INJECT);
+ }
+ continue;
+ } else if (splits[s].indexOf("-sitemap") == 0) {
+ injectType = InjectType.SITEMAP_INJECT;
+ continue;
+ } else if (indexEquals == -1) {
// skip anything without a =
continue;
}
@@ -143,6 +153,12 @@ public class InjectorJob extends NutchTo
metadata.put(metaname, metavalue);
}
}
+ write(url, context, customInterval, customScore, metadata, injectType);
+ }
+
+ private void write(String url, Context context, Integer customInterval,
+ Float customScore, Map<String, String> metadata, InjectType injectType)
+ throws IOException, InterruptedException {
try {
url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
url = filters.filter(url); // filter the url
@@ -177,14 +193,13 @@ public class InjectorJob extends NutchTo
scfilters.injectedScore(url, row);
} catch (ScoringFilterException e) {
if (LOG.isWarnEnabled()) {
- LOG.warn("Cannot filter injected score for url " + url
- + ", using default (" + e.getMessage() + ")");
+ LOG.warn("Cannot filter injected score for url {}, using default ({})", url, e.getMessage());
}
}
context.getCounter("injector", "urls_injected").increment(1);
row.getMarkers()
- .put(DbUpdaterJob.DISTANCE, new Utf8(String.valueOf(0)));
- Mark.INJECT_MARK.putMark(row, YES_STRING);
+ .put(DbUpdaterJob.DISTANCE, new Utf8(String.valueOf(0)));
+ Mark.INJECT_MARK.putMark(row, injectType.getTypeString());
context.write(reversedUrl, row);
}
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java Tue Jan 26 19:19:02 2016
@@ -35,6 +35,7 @@ import org.apache.hadoop.util.ToolRunner
import org.apache.nutch.crawl.GeneratorJob;
import org.apache.nutch.crawl.URLPartitioner.FetchEntryPartitioner;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
import org.apache.nutch.parse.ParserJob;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.storage.Mark;
@@ -61,6 +62,8 @@ public class FetcherJob extends NutchToo
public static final Utf8 REDIRECT_DISCOVERED = new Utf8("___rdrdsc__");
public static final String RESUME_KEY = "fetcher.job.resume";
+ public static final String SITEMAP = "fetcher.job.sitemap";
+ public static final String SITEMAP_DETECT = "fetcher.job.sitemap.detect";
public static final String PARSE_KEY = "fetcher.parse";
public static final String THREADS_KEY = "fetcher.threads.fetch";
@@ -90,7 +93,7 @@ public class FetcherJob extends NutchToo
* </p>
*/
public static class FetcherMapper extends
- GoraMapper<String, WebPage, IntWritable, FetchEntry> {
+ GoraMapper<String, WebPage, IntWritable, FetchEntry> {
private boolean shouldContinue;
@@ -112,17 +115,22 @@ public class FetcherJob extends NutchToo
if (Mark.GENERATE_MARK.checkMark(page) == null) {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping " + TableUtil.unreverseUrl(key)
- + "; not generated yet");
+ + "; not generated yet");
}
return;
}
if (shouldContinue && Mark.FETCH_MARK.checkMark(page) != null) {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping " + TableUtil.unreverseUrl(key)
- + "; already fetched");
+ + "; already fetched");
}
return;
}
+ boolean sitemap = context.getConfiguration().getBoolean(SITEMAP, false);
+
+ if ((sitemap && !URLFilters.isSitemap(page)) || !sitemap && URLFilters
+ .isSitemap(page))
+ return;
context.write(new IntWritable(random.nextInt(65536)), new FetchEntry(
context.getConfiguration(), key, page));
}
@@ -158,6 +166,8 @@ public class FetcherJob extends NutchToo
Integer threads = (Integer) args.get(Nutch.ARG_THREADS);
Boolean shouldResume = (Boolean) args.get(Nutch.ARG_RESUME);
Integer numTasks = (Integer) args.get(Nutch.ARG_NUMTASKS);
+ Boolean stmDetect = (Boolean) args.get(Nutch.ARG_SITEMAP_DETECT);
+ Boolean sitemap = (Boolean) args.get(Nutch.ARG_SITEMAP);
if (threads != null && threads > 0) {
getConf().setInt(THREADS_KEY, threads);
@@ -169,10 +179,16 @@ public class FetcherJob extends NutchToo
if (shouldResume != null) {
getConf().setBoolean(RESUME_KEY, shouldResume);
}
+ if (stmDetect != null) {
+ getConf().setBoolean(SITEMAP_DETECT, stmDetect);
+ }
+ if (sitemap != null) {
+ getConf().setBoolean(SITEMAP, sitemap);
+ }
- LOG.info("FetcherJob: threads: " + getConf().getInt(THREADS_KEY, 10));
- LOG.info("FetcherJob: parsing: " + getConf().getBoolean(PARSE_KEY, false));
- LOG.info("FetcherJob: resuming: " + getConf().getBoolean(RESUME_KEY, false));
+ LOG.info("FetcherJob: threads: {}", getConf().getInt(THREADS_KEY, 10));
+ LOG.info("FetcherJob: parsing: {}", getConf().getBoolean(PARSE_KEY, false));
+ LOG.info("FetcherJob: resuming: {}", getConf().getBoolean(RESUME_KEY, false));
// set the actual time for the timelimit relative
// to the beginning of the whole job and not of a specific task
@@ -182,8 +198,7 @@ public class FetcherJob extends NutchToo
timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
getConf().setLong("fetcher.timelimit", timelimit);
}
- LOG.info("FetcherJob : timelimit set for : "
- + getConf().getLong("fetcher.timelimit", -1));
+ LOG.info("FetcherJob : timelimit set for : {}", getConf().getLong("fetcher.timelimit", -1));
numJobs = 1;
currentJob = NutchJob.getInstance(getConf(), "fetch");
@@ -237,6 +252,31 @@ public class FetcherJob extends NutchToo
*/
public int fetch(String batchId, int threads, boolean shouldResume,
int numTasks) throws Exception {
+ return fetch(batchId, threads, shouldResume, numTasks, false, false);
+ }
+
+ /**
+ * Run fetcher.
+ *
+ * @param batchId
+ * batchId (obtained from Generator) or null to fetch all generated
+ * fetchlists
+ * @param threads
+ * number of threads per map task
+ * @param shouldResume
+ * @param numTasks
+ * number of fetching tasks (reducers). If set to < 1 then use the
+ * default, which is mapred.map.tasks.
+ * @param stmDetect
+ * If set true, sitemap detection is run.
+ * @param sitemap
+ * If set true, only sitemap files are fetched; if set false, only
+ * normal URLs are fetched.
+ * @return 0 on success
+ * @throws Exception
+ */
+ public int fetch(String batchId, int threads, boolean shouldResume,
+ int numTasks, boolean stmDetect, boolean sitemap) throws Exception {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
@@ -249,11 +289,12 @@ public class FetcherJob extends NutchToo
}
run(ToolUtil.toArgMap(Nutch.ARG_BATCH, batchId, Nutch.ARG_THREADS, threads,
- Nutch.ARG_RESUME, shouldResume, Nutch.ARG_NUMTASKS, numTasks));
+ Nutch.ARG_RESUME, shouldResume, Nutch.ARG_NUMTASKS, numTasks,
+ Nutch.ARG_SITEMAP_DETECT, stmDetect, Nutch.ARG_SITEMAP, sitemap));
long finish = System.currentTimeMillis();
LOG.info("FetcherJob: finished at " + sdf.format(finish)
- + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
+ + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
return 0;
}
@@ -275,6 +316,7 @@ public class FetcherJob extends NutchToo
public int run(String[] args) throws Exception {
int threads = -1;
boolean shouldResume = false;
+ boolean stmRobot = false, sitemap = false;
String batchId;
String usage = "Usage: FetcherJob (<batchId> | -all) [-crawlId <id>] "
@@ -283,7 +325,9 @@ public class FetcherJob extends NutchToo
+ " -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)\n"
+ " -threads N - number of fetching threads per task\n"
+ " -resume - resume interrupted job\n"
- + " -numTasks N - if N > 0 then use this many reduce tasks for fetching \n \t \t (default: mapred.map.tasks)";
+ + " -numTasks N - if N > 0 then use this many reduce tasks for fetching \n \t \t (default: mapred.map.tasks)\n"
+ + " -sitemap - only sitemap files are fetched, defaults to false\n"
+ + " -stmDetect - sitemap files are detected from the robots.txt file";
if (args.length == 0) {
System.err.println(usage);
@@ -306,13 +350,17 @@ public class FetcherJob extends NutchToo
numTasks = Integer.parseInt(args[++i]);
} else if ("-crawlId".equals(args[i])) {
getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
+ } else if ("-sitemap".equals(args[i])) {
+ sitemap = true;
+ } else if ("-stmDetect".equals(args[i])) {
+ stmRobot = true;
} else {
throw new IllegalArgumentException("arg " + args[i] + " not recognized");
}
}
- int fetchcode = fetch(batchId, threads, shouldResume, numTasks); // run the
- // Fetcher
+ int fetchcode = fetch(batchId, threads, shouldResume, numTasks, stmRobot,
+ sitemap); // run the Fetcher
return fetchcode;
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java Tue Jan 26 19:19:02 2016
@@ -491,7 +491,8 @@ public class FetcherReducer extends
}
try {
LOG.info("fetching " + fit.url + " (queue crawl delay="
- + fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay + "ms)");
+ + fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay
+ + "ms)");
// fetch the page
final Protocol protocol = this.protocolFactory.getProtocol(fit.url);
@@ -527,6 +528,19 @@ public class FetcherReducer extends
}
}
}
+
+ boolean stmRobot = context.getConfiguration().getBoolean(FetcherJob.SITEMAP_DETECT, false);
+
+ if (stmRobot && (fit.u.getFile() == null
+ || fit.u.getFile().length() == 0 || (
+ fit.u.getFile().length() == 1 && fit.u.getFile().equals(
+ "/")))) {
+ for (String stmUrl : rules.getSitemaps()) {
+ fit.page.getSitemaps()
+ .put(new Utf8(stmUrl), new Utf8());
+ }
+ }
+
final ProtocolOutput output = protocol.getProtocolOutput(fit.url,
fit.page);
final ProtocolStatus status = output.getStatus();
@@ -806,7 +820,13 @@ public class FetcherReducer extends
parse = conf.getBoolean(FetcherJob.PARSE_KEY, false);
storingContent = conf.getBoolean("fetcher.store.content", true);
if (parse) {
- skipTruncated = conf.getBoolean(ParserJob.SKIP_TRUNCATED, true);
+ boolean sitemap = conf.getBoolean(FetcherJob.SITEMAP, false);
+
+ if (sitemap) {
+ skipTruncated = false;
+ } else {
+ skipTruncated = conf.getBoolean(ParserJob.SKIP_TRUNCATED, true);
+ }
parseUtil = new ParseUtil(conf);
}
LOG.info("Fetcher: threads: " + threadCount);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java Tue Jan 26 19:19:02 2016
@@ -19,10 +19,7 @@ package org.apache.nutch.metadata;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
-import java.util.Enumeration;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Properties;
+import java.util.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
@@ -97,6 +94,15 @@ public class Metadata implements Writabl
return _getValues(name);
}
+ /**
+ * Get all metadata entries.
+ *
+ * @return a set view of all metadata name/value-array entries.
+ */
+ public Set<Map.Entry<String, String[]>> getMetaData() {
+ return metadata.entrySet();
+ }
+
private String[] _getValues(final String name) {
String[] values = metadata.get(name);
if (values == null) {
Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java Tue Jan 26 19:19:02 2016
@@ -112,6 +112,10 @@ public interface Nutch {
public static final String ARG_CLASS = "class";
/** Depth (number of cycles) of a crawl. */
public static final String ARG_DEPTH = "depth";
+ /** Sitemaps. */
+ public static final String ARG_SITEMAP = "sitemap";
+ /** Detect sitemap URLs from robots.txt during fetch. */
+ public static final String ARG_SITEMAP_DETECT = "stmDetect";
// short constants for status / results fields
/** Status / result message. */
Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java Tue Jan 26 19:19:02 2016
@@ -21,10 +21,13 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
+import org.apache.nutch.crawl.InjectType;
import org.apache.nutch.plugin.Extension;
import org.apache.nutch.plugin.ExtensionPoint;
import org.apache.nutch.plugin.PluginRuntimeException;
import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.storage.Mark;
+import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.ObjectCache;
import org.apache.hadoop.conf.Configuration;
@@ -93,4 +96,17 @@ public class URLFilters {
}
return urlString;
}
+
+ /**
+ * Returns true if the given page was injected as a sitemap.
+ *
+ */
+ public static boolean isSitemap(WebPage page) {
+ if (InjectType.SITEMAP_INJECT.getTypeString().equals(
+ Mark.INJECT_MARK.checkMark(page))) {
+ return true;
+ } else {
+ return false;
+ }
+ }
}
Added: nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParse.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParse.java?rev=1726853&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParse.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParse.java Tue Jan 26 19:19:02 2016
@@ -0,0 +1,55 @@
+/**
+ * ****************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ****************************************************************************
+ */
+package org.apache.nutch.parse;
+
+import org.apache.nutch.metadata.Metadata;
+
+import java.util.List;
+import java.util.Map;
+
+public class NutchSitemapParse {
+
+ private Map<Outlink, Metadata> outlinkMap;
+ private org.apache.nutch.storage.ParseStatus parseStatus;
+
+ public NutchSitemapParse() {
+ }
+
+ public NutchSitemapParse(Map<Outlink, Metadata> outlinkMap,
+ org.apache.nutch.storage.ParseStatus parseStatus) {
+ this.outlinkMap = outlinkMap;
+ this.parseStatus = parseStatus;
+ }
+
+ public Map<Outlink, Metadata> getOutlinkMap() {
+ return outlinkMap;
+ }
+
+ public org.apache.nutch.storage.ParseStatus getParseStatus() {
+ return parseStatus;
+ }
+
+ public void setOutlinks(Map<Outlink, Metadata> outlinkMap) {
+ this.outlinkMap = outlinkMap;
+ }
+
+ public void setParseStatus(org.apache.nutch.storage.ParseStatus parseStatus) {
+ this.parseStatus = parseStatus;
+ }
+}
Added: nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParser.java?rev=1726853&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParser.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParser.java Tue Jan 26 19:19:02 2016
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.*;
+
+import crawlercommons.sitemaps.*;
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+
+import org.apache.nutch.storage.ParseStatus;
+import org.apache.nutch.storage.WebPage;
+
+public class NutchSitemapParser {
+
+ private Configuration conf;
+
+ private static Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+ static {
+ FIELDS.add(WebPage.Field.BASE_URL);
+ }
+
+ public NutchSitemapParse getParse(String url, WebPage page) {
+ NutchSitemapParse nutchSitemapParse = null;
+ SiteMapParser parser = new SiteMapParser();
+
+ AbstractSiteMap siteMap = null;
+ String contentType = page.getContentType().toString();
+ try {
+ siteMap = parser
+ .parseSiteMap(contentType, page.getContent().array(),
+ new URL(url));
+ } catch (UnknownFormatException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ Map<Outlink, Metadata> outlinkMap = null;
+ Iterator i$;
+ if (siteMap.isIndex()) {
+ Collection<AbstractSiteMap> links = ((SiteMapIndex) siteMap)
+ .getSitemaps();
+ for (AbstractSiteMap siteMapIndex : links) {
+ page.getSitemaps().put(new Utf8(siteMapIndex.getUrl().toString()),
+ new Utf8("parser"));
+ }
+
+ } else {
+ Collection<SiteMapURL> links = ((SiteMap) siteMap).getSiteMapUrls();
+ outlinkMap = new HashMap<Outlink, Metadata>();
+
+ for (SiteMapURL sitemapUrl : links) {
+ Metadata metadata = new Metadata();
+ metadata
+ .add("changeFrequency", sitemapUrl.getChangeFrequency().name());
+ metadata.add("lastModified", Long.toString(
+ sitemapUrl.getLastModified().getTime()));
+ metadata.add("priority", Double.toString(sitemapUrl.getPriority()));
+ try {
+ outlinkMap.put(
+ new Outlink(sitemapUrl.getUrl().toString(), "sitemap.outlink"),
+ metadata);
+ } catch (MalformedURLException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+ ParseStatus status = ParseStatus.newBuilder().build();
+ status.setMajorCode((int) ParseStatusCodes.SUCCESS);
+ nutchSitemapParse = new NutchSitemapParse(outlinkMap, status);
+ return nutchSitemapParse;
+ }
+
+ public void setConf(Configuration conf) {
+ this.conf = conf;
+ }
+
+ public Configuration getConf() {
+ return conf;
+ }
+
+ public Collection<WebPage.Field> getFields() {
+ return FIELDS;
+ }
+}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Tue Jan 26 19:19:02 2016
@@ -22,24 +22,31 @@ import com.google.common.util.concurrent
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlStatus;
+import org.apache.nutch.crawl.InjectType;
import org.apache.nutch.crawl.Signature;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.fetcher.FetcherJob;
+import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.storage.Mark;
+import org.apache.nutch.storage.ParseStatus;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
+import java.util.Map;
+import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
@@ -56,6 +63,9 @@ import java.util.concurrent.TimeUnit;
*/
public class ParseUtil extends Configured {
+ public enum ChangeFrequency {
+ ALWAYS, HOURLY, DAILY, WEEKLY, MONTHLY, YEARLY, NEVER
+ }
/* our log stream */
public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class);
@@ -90,7 +100,12 @@ public class ParseUtil extends Configure
public void setConf(Configuration conf) {
this.conf = conf;
parserFactory = new ParserFactory(conf);
- maxParseTime = conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME);
+ if (conf.getBoolean("parse.sitemap", false)) {
+ maxParseTime = conf
+ .getInt("sitemap.parser.timeout", DEFAULT_MAX_PARSE_TIME);
+ } else {
+ maxParseTime = conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME);
+ }
sig = SignatureFactory.getSignature(conf);
filters = new URLFilters(conf);
normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
@@ -113,25 +128,15 @@ public class ParseUtil extends Configure
* @throws ParseException
* If there is an error parsing.
*/
- public Parse parse(String url, WebPage page) throws ParserNotFound,
- ParseException {
+ public Parse parse(String url, WebPage page) throws ParseException {
Parser[] parsers = null;
+ Parse parse = null;
String contentType = TableUtil.toString(page.getContentType());
-
parsers = this.parserFactory.getParsers(contentType, url);
for (int i = 0; i < parsers.length; i++) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("Parsing [" + url + "] with [" + parsers[i] + "]");
- }
- Parse parse = null;
-
- if (maxParseTime != -1)
- parse = runParser(parsers[i], url, page);
- else
- parse = parsers[i].getParse(url, page);
-
+ parse = parse(url, page, parsers[i]);
if (parse != null && ParseStatusUtils.isSuccess(parse.getParseStatus())) {
return parse;
}
@@ -143,6 +148,17 @@ public class ParseUtil extends Configure
"Unable to successfully parse content"), null);
}
+ private Parse parse(String url, WebPage page, Parser parser) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Parsing [" + url + "] with [" + parser + "]");
+ }
+ if (maxParseTime != -1) {
+ return runParser(parser, url, page);
+ } else {
+ return parser.getParse(url, page);
+ }
+ }
+
private Parse runParser(Parser p, String url, WebPage page) {
ParseCallable pc = new ParseCallable(p, page, url);
Future<Parse> task = executorService.submit(pc);
@@ -158,24 +174,157 @@ public class ParseUtil extends Configure
return res;
}
- /**
- * Parses given web page and stores parsed content within page. Puts a
- * meta-redirect to outlinks.
- *
- * @param key
- * @param page
- */
- public void process(String key, WebPage page) {
- String url = TableUtil.unreverseUrl(key);
+ public boolean status(String url, WebPage page) {
byte status = page.getStatus().byteValue();
if (status != CrawlStatus.STATUS_FETCHED) {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping " + url + " as status is: "
+ CrawlStatus.getName(status));
}
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Parses given sitemap page and stores parsed content within page.
+ *
+ */
+ public void processSitemapParse(String url, WebPage page,
+ Mapper.Context context) {
+ if (status(url, page)) {
return;
}
+ NutchSitemapParser sParser = new NutchSitemapParser();
+ NutchSitemapParse nutchSitemapParse = sParser.getParse(url, page);
+
+ if (nutchSitemapParse == null) {
+ return;
+ }
+
+ ParseStatus pstatus = nutchSitemapParse.getParseStatus();
+ page.setParseStatus(pstatus);
+ if (ParseStatusUtils.isSuccess(pstatus)) {
+ final Map<Outlink, Metadata> outlinkMap = nutchSitemapParse
+ .getOutlinkMap();
+ if (pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) {
+ successRedirect(url, page, pstatus);
+ } else if (outlinkMap != null) {
+ Set<Outlink> outlinks = outlinkMap.keySet();
+ setSignature(page);
+
+ for (Outlink outlink : outlinks) {
+ String toUrl = outlink.getToUrl();
+
+ try {
+ toUrl = normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
+ toUrl = filters.filter(toUrl);
+ } catch (MalformedURLException e2) {
+ return;
+ } catch (URLFilterException e) {
+ return;
+ }
+ if (toUrl == null) {
+ return;
+ }
+ String reversedUrl = null;
+ try {
+ reversedUrl = TableUtil.reverseUrl(toUrl); // collect it
+ } catch (MalformedURLException e) {
+ e.printStackTrace();
+ }
+ WebPage newRow = WebPage.newBuilder().build();
+ Set<Map.Entry<String, String[]>> metaDatas = outlinkMap.get(outlink)
+ .getMetaData();
+ for (Map.Entry<String, String[]> metadata : metaDatas) {
+ System.out.println();
+ newRow.getMetadata().put(new Utf8(metadata.getKey()),
+ ByteBuffer.wrap(metadata.getValue()[0].getBytes()));
+ }
+
+ int changeFrequency = calculateFetchInterval(
+ outlinkMap.get(outlink).get("changeFrequency"));
+ String modifiedTime = outlinkMap.get(outlink).get("lastModified");
+
+ newRow.setFetchInterval(changeFrequency);
+ newRow.setModifiedTime(Long.valueOf(modifiedTime));
+ newRow.setStmPriority(
+ Float.parseFloat(outlinkMap.get(outlink).get("priority")));
+
+ Mark.INJECT_MARK.putMark(newRow, InjectType.SITEMAP_INJECT.getTypeString());
+
+ try {
+ context.write(reversedUrl, newRow);
+ } catch (IOException e) {
+ e.printStackTrace();
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ }
+
+ parseMark(page);
+ }
+ }
+
+ }
+
+ private int calculateFetchInterval(String changeFrequency) {
+ if (changeFrequency.equals(ChangeFrequency.ALWAYS.toString())
+ || changeFrequency.equals(ChangeFrequency.HOURLY.toString())) {
+ return 3600; // 60 * 60
+ } else if (changeFrequency.equals(ChangeFrequency.DAILY.toString())) {
+ return 86400; // 24 * 60 * 60
+ } else if (changeFrequency.equals(ChangeFrequency.WEEKLY.toString())) {
+ return 604800; // 7 * 24 * 60 * 60
+ } else if (changeFrequency.equals(ChangeFrequency.MONTHLY.toString())) {
+ return 2628000; // average seconds in one month
+ } else if (changeFrequency.equals(ChangeFrequency.YEARLY.toString())
+ || changeFrequency.equals(ChangeFrequency.NEVER.toString())) {
+ return 31536000; // average seconds in one year
+ } else {
+ return Integer.MAX_VALUE; // other intervals are larger than Integer.MAX_VALUE
+ }
+ }
+
+ private void parseMark(WebPage page) {
+ Utf8 fetchMark = Mark.FETCH_MARK.checkMark(page);
+ if (fetchMark != null) {
+ Mark.PARSE_MARK.putMark(page, fetchMark);
+ }
+ }
+
+ private void putOutlink(WebPage page, Outlink outlink, String toUrl) {
+ try {
+ toUrl = normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
+ toUrl = filters.filter(toUrl);
+ } catch (MalformedURLException e2) {
+ return;
+ } catch (URLFilterException e) {
+ return;
+ }
+ if (toUrl == null) {
+ return;
+ }
+ Utf8 utf8ToUrl = new Utf8(toUrl);
+ if (page.getOutlinks().get(utf8ToUrl) != null) {
+ // skip duplicate outlinks
+ return;
+ }
+ page.getOutlinks().put(utf8ToUrl, new Utf8(outlink.getAnchor()));
+ }
+
+ /**
+ * Parses given web page and stores parsed content within page. Puts a
+ * meta-redirect to outlinks.
+ *
+ * @param url
+ * @param page
+ */
+ public void process(String url, WebPage page) {
+ if (status(url, page)) {
+ return;
+ }
Parse parse;
try {
parse = parse(url, page);
@@ -193,58 +342,20 @@ public class ParseUtil extends Configure
return;
}
- org.apache.nutch.storage.ParseStatus pstatus = parse.getParseStatus();
+ ParseStatus pstatus = parse.getParseStatus();
page.setParseStatus(pstatus);
if (ParseStatusUtils.isSuccess(pstatus)) {
if (pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) {
- String newUrl = ParseStatusUtils.getMessage(pstatus);
- int refreshTime = Integer.parseInt(ParseStatusUtils.getArg(pstatus, 1));
- try {
- newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
- if (newUrl == null) {
- LOG.warn("redirect normalized to null " + url);
- return;
- }
- try {
- newUrl = filters.filter(newUrl);
- } catch (URLFilterException e) {
- return;
- }
- if (newUrl == null) {
- LOG.warn("redirect filtered to null " + url);
- return;
- }
- } catch (MalformedURLException e) {
- LOG.warn("malformed url exception parsing redirect " + url);
- return;
- }
- page.getOutlinks().put(new Utf8(newUrl), new Utf8());
- page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED,
- TableUtil.YES_VAL);
- if (newUrl == null || newUrl.equals(url)) {
- String reprUrl = URLUtil.chooseRepr(url, newUrl,
- refreshTime < FetcherJob.PERM_REFRESH_TIME);
- if (reprUrl == null) {
- LOG.warn("reprUrl==null for " + url);
- return;
- } else {
- page.setReprUrl(new Utf8(reprUrl));
- }
- }
+ successRedirect(url, page, pstatus);
} else {
page.setText(new Utf8(parse.getText()));
page.setTitle(new Utf8(parse.getTitle()));
- ByteBuffer prevSig = page.getSignature();
- if (prevSig != null) {
- page.setPrevSignature(prevSig);
- }
- final byte[] signature = sig.calculate(page);
- page.setSignature(ByteBuffer.wrap(signature));
+
+ setSignature(page);
+
if (page.getOutlinks() != null) {
page.getOutlinks().clear();
}
- final Outlink[] outlinks = parse.getOutlinks();
- int outlinksToStore = Math.min(maxOutlinks, outlinks.length);
String fromHost;
if (ignoreExternalLinks) {
try {
@@ -257,24 +368,11 @@ public class ParseUtil extends Configure
}
int validCount = 0;
- for (int i = 0; validCount < outlinksToStore && i < outlinks.length; i++) {
+ final Outlink[] outlinks = parse.getOutlinks();
+ int outlinksToStore = Math.min(maxOutlinks, outlinks.length);
+ for (int i = 0; validCount < outlinksToStore
+ && i < outlinks.length; i++, validCount++) {
String toUrl = outlinks[i].getToUrl();
- try {
- toUrl = normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
- toUrl = filters.filter(toUrl);
- } catch (MalformedURLException e2) {
- continue;
- } catch (URLFilterException e) {
- continue;
- }
- if (toUrl == null) {
- continue;
- }
- Utf8 utf8ToUrl = new Utf8(toUrl);
- if (page.getOutlinks().get(utf8ToUrl) != null) {
- // skip duplicate outlinks
- continue;
- }
String toHost;
if (ignoreExternalLinks) {
try {
@@ -286,14 +384,56 @@ public class ParseUtil extends Configure
continue; // skip it
}
}
- validCount++;
- page.getOutlinks().put(utf8ToUrl, new Utf8(outlinks[i].getAnchor()));
- }
- Utf8 fetchMark = Mark.FETCH_MARK.checkMark(page);
- if (fetchMark != null) {
- Mark.PARSE_MARK.putMark(page, fetchMark);
+ putOutlink(page, outlinks[i], toUrl);
}
+ parseMark(page);
+ }
+ }
+ }
+
+ private void successRedirect(String url, WebPage page, ParseStatus pstatus) {
+ String newUrl = ParseStatusUtils.getMessage(pstatus);
+ int refreshTime = Integer.parseInt(ParseStatusUtils.getArg(pstatus, 1));
+ try {
+ newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
+ if (newUrl == null) {
+ LOG.warn("redirect normalized to null " + url);
+ return;
+ }
+ try {
+ newUrl = filters.filter(newUrl);
+ } catch (URLFilterException e) {
+ return;
+ }
+ if (newUrl == null) {
+ LOG.warn("redirect filtered to null " + url);
+ return;
}
+ } catch (MalformedURLException e) {
+ LOG.warn("malformed url exception parsing redirect " + url);
+ return;
+ }
+ page.getOutlinks().put(new Utf8(newUrl), new Utf8());
+ page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED,
+ TableUtil.YES_VAL);
+ if (newUrl == null || newUrl.equals(url)) {
+ String reprUrl = URLUtil.chooseRepr(url, newUrl,
+ refreshTime < FetcherJob.PERM_REFRESH_TIME);
+ if (reprUrl == null) {
+ LOG.warn("reprUrl==null for " + url);
+ return;
+ } else {
+ page.setReprUrl(new Utf8(reprUrl));
+ }
+ }
+ }
+
+ private void setSignature(WebPage page) {
+ ByteBuffer prevSig = page.getSignature();
+ if (prevSig != null) {
+ page.setPrevSignature(prevSig);
}
+ final byte[] signature = sig.calculate(page);
+ page.setSignature(ByteBuffer.wrap(signature));
}
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java Tue Jan 26 19:19:02 2016
@@ -35,6 +35,7 @@ import org.apache.nutch.crawl.GeneratorJ
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.HttpHeaders;
import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
import org.apache.nutch.storage.Mark;
import org.apache.nutch.storage.ParseStatus;
import org.apache.nutch.storage.StorageUtils;
@@ -61,6 +62,8 @@ public class ParserJob extends NutchTool
private static final Utf8 REPARSE = new Utf8("-reparse");
+ private static String SITEMAP_PARSE = "parse.sitemap";
+
private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
private Configuration conf;
@@ -75,16 +78,19 @@ public class ParserJob extends NutchTool
FIELDS.add(WebPage.Field.OUTLINKS);
FIELDS.add(WebPage.Field.METADATA);
FIELDS.add(WebPage.Field.HEADERS);
+ FIELDS.add(WebPage.Field.SITEMAPS);
+ FIELDS.add(WebPage.Field.STM_PRIORITY);
}
- public static class ParserMapper extends
- GoraMapper<String, WebPage, String, WebPage> {
+ public static class ParserMapper extends GoraMapper<String, WebPage, String, WebPage> {
private ParseUtil parseUtil;
private boolean shouldResume;
private boolean force;
+ private boolean sitemap;
+
private Utf8 batchId;
private boolean skipTruncated;
@@ -95,9 +101,15 @@ public class ParserJob extends NutchTool
parseUtil = new ParseUtil(conf);
shouldResume = conf.getBoolean(RESUME_KEY, false);
force = conf.getBoolean(FORCE_KEY, false);
+ sitemap = conf.getBoolean(SITEMAP_PARSE, false);
batchId = new Utf8(
conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR));
skipTruncated = conf.getBoolean(SKIP_TRUNCATED, true);
+ if (sitemap) {
+ skipTruncated = false;
+ } else {
+ skipTruncated = conf.getBoolean(SKIP_TRUNCATED, true);
+ }
}
@Override
@@ -109,8 +121,7 @@ public class ParserJob extends NutchTool
} else {
if (Mark.FETCH_MARK.checkMark(page) == null) {
if (LOG.isDebugEnabled()) {
- LOG.debug("Skipping " + TableUtil.unreverseUrl(key)
- + "; not fetched yet");
+ LOG.debug("Skipping {}, not fetched yet", unreverseKey);
}
return;
}
@@ -130,7 +141,12 @@ public class ParserJob extends NutchTool
return;
}
- parseUtil.process(key, page);
+ if (sitemap && URLFilters.isSitemap(page)) {
+ LOG.info("Parsing for sitemap"); //TODO this log should be top line
+ parseUtil.processSitemapParse(unreverseKey, page, context);
+ } else {
+ parseUtil.process(unreverseKey, page);
+ }
ParseStatus pstatus = page.getParseStatus();
if (pstatus != null) {
context.getCounter("ParserStatus",
@@ -230,6 +246,7 @@ public class ParserJob extends NutchTool
String batchId = (String) args.get(Nutch.ARG_BATCH);
Boolean shouldResume = (Boolean) args.get(Nutch.ARG_RESUME);
Boolean force = (Boolean) args.get(Nutch.ARG_FORCE);
+ Boolean sitemap = (Boolean) args.get(Nutch.ARG_SITEMAP);
if (batchId != null) {
getConf().set(GeneratorJob.BATCH_ID, batchId);
@@ -240,13 +257,15 @@ public class ParserJob extends NutchTool
if (force != null) {
getConf().setBoolean(FORCE_KEY, force);
}
- LOG.info("ParserJob: resuming:\t" + getConf().getBoolean(RESUME_KEY, false));
- LOG.info("ParserJob: forced reparse:\t"
- + getConf().getBoolean(FORCE_KEY, false));
+ if (sitemap != null) {
+ getConf().setBoolean(SITEMAP_PARSE, sitemap);
+ }
+ LOG.info("ParserJob: resuming:\t{}", getConf().getBoolean(RESUME_KEY, false));
+ LOG.info("ParserJob: forced reparse:\t {}", getConf().getBoolean(FORCE_KEY, false));
if (batchId == null || batchId.equals(Nutch.ALL_BATCH_ID_STR)) {
LOG.info("ParserJob: parsing all");
} else {
- LOG.info("ParserJob: batchId:\t" + batchId);
+ LOG.info("ParserJob: batchId:\t{}", batchId);
}
currentJob = NutchJob.getInstance(getConf(), "parse");
@@ -278,39 +297,48 @@ public class ParserJob extends NutchTool
public int parse(String batchId, boolean shouldResume, boolean force)
throws Exception {
+ return parse(batchId, shouldResume, force, false);
+ }
+
+ public int parse(String batchId, boolean shouldResume, boolean force,
+ boolean sitemap)
+ throws Exception {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
- LOG.info("ParserJob: starting at " + sdf.format(start));
+ LOG.info("ParserJob: starting at {}", sdf.format(start));
run(ToolUtil.toArgMap(Nutch.ARG_BATCH, batchId, Nutch.ARG_RESUME,
- shouldResume, Nutch.ARG_FORCE, force));
+ shouldResume, Nutch.ARG_FORCE, force, Nutch.ARG_SITEMAP, sitemap));
LOG.info("ParserJob: success");
long finish = System.currentTimeMillis();
- LOG.info("ParserJob: finished at " + sdf.format(finish)
- + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
+ LOG.info("ParserJob: finished at {}, time elapsed: {}",
+ sdf.format(finish), TimingUtil.elapsedTime(start, finish));
return 0;
}
public int run(String[] args) throws Exception {
boolean shouldResume = false;
boolean force = false;
+ boolean sitemap = false;
String batchId = null;
if (args.length < 1) {
System.err
- .println("Usage: ParserJob (<batchId> | -all) [-crawlId <id>] [-resume] [-force]");
+ .println("Usage: ParserJob (<batchId> | -all) [-crawlId <id>] [-resume] [-force] [-sitemap]");
+ System.err
+ .println(" <batchId> - symbolic batch ID created by Generator");
System.err
- .println(" <batchId> - symbolic batch ID created by Generator");
+ .println(" -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)");
System.err
- .println(" -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)");
+ .println(" -all - consider pages from all crawl jobs");
System.err
- .println(" -all - consider pages from all crawl jobs");
+ .println(" -sitemap - parse only sitemap pages, default false");
System.err
- .println(" -resume - resume a previous incomplete job");
+ .println(" -resume - resume a previous incomplete job");
System.err
- .println(" -force - force re-parsing even if a page is already parsed");
+ .println(" -force - force re-parsing even if a page is already parsed");
return -1;
}
for (int i = 0; i < args.length; i++) {
@@ -322,6 +350,8 @@ public class ParserJob extends NutchTool
getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
} else if ("-all".equals(args[i])) {
batchId = args[i];
+ } else if ("-sitemap".equals(args[i])) {
+ sitemap = true;
} else {
if (batchId != null) {
System.err.println("BatchId already set to '" + batchId + "'!");
@@ -334,7 +364,7 @@ public class ParserJob extends NutchTool
System.err.println("BatchId not set (or -all/-reparse not specified)!");
return -1;
}
- return parse(batchId, shouldResume, force);
+ return parse(batchId, shouldResume, force, sitemap);
}
public static void main(String[] args) throws Exception {
Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/Mark.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/Mark.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/Mark.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/Mark.java Tue Jan 26 19:19:02 2016
@@ -19,8 +19,9 @@ package org.apache.nutch.storage;
import org.apache.avro.util.Utf8;
public enum Mark {
- INJECT_MARK("_injmrk_"), GENERATE_MARK("_gnmrk_"), FETCH_MARK("_ftcmrk_"), PARSE_MARK(
- "__prsmrk__"), UPDATEDB_MARK("_updmrk_"), INDEX_MARK("_idxmrk_");
+ INJECT_MARK("_injmrk_"), GENERATE_MARK("_gnmrk_"), FETCH_MARK("_ftcmrk_"),
+ PARSE_MARK("__prsmrk__"), UPDATEDB_MARK("_updmrk_"), INDEX_MARK("_idxmrk_"),
+ SITEMAP_MARK("_stmmrk_");
private Utf8 name;