You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2016/01/26 20:19:03 UTC

svn commit: r1726853 [1/2] - in /nutch/branches/2.x: ./ conf/ src/gora/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/parse/ src/java/org...

Author: lewismc
Date: Tue Jan 26 19:19:02 2016
New Revision: 1726853

URL: http://svn.apache.org/viewvc?rev=1726853&view=rev
Log:
NUTCH-1741 Support of Sitemaps in Nutch 2.x

Added:
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectType.java
    nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParse.java
    nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParser.java
    nutch/branches/2.x/src/test/org/apache/nutch/parse/TestSitemapParser.java
    nutch/branches/2.x/src/test/org/apache/nutch/util/HelloHandler.java
    nutch/branches/2.x/src/testresources/fetch-test-site/sitemap1.xml
    nutch/branches/2.x/src/testresources/fetch-test-site/sitemap2.xml
    nutch/branches/2.x/src/testresources/fetch-test-site/sitemapIndex.xml
Modified:
    nutch/branches/2.x/.gitignore
    nutch/branches/2.x/conf/gora-accumulo-mapping.xml
    nutch/branches/2.x/conf/gora-cassandra-mapping.xml
    nutch/branches/2.x/conf/gora-hbase-mapping.xml
    nutch/branches/2.x/conf/gora-mongodb-mapping.xml
    nutch/branches/2.x/conf/gora-solr-mapping.xml
    nutch/branches/2.x/conf/nutch-default.xml
    nutch/branches/2.x/src/gora/webpage.avsc
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
    nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java
    nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java
    nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java
    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/storage/Mark.java
    nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java
    nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java
    nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
    nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
    nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java
    nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java
    nutch/branches/2.x/src/testresources/fetch-test-site/robots.txt

Modified: nutch/branches/2.x/.gitignore
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/.gitignore?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/.gitignore (original)
+++ nutch/branches/2.x/.gitignore Tue Jan 26 19:19:02 2016
@@ -5,3 +5,7 @@ conf/slaves
 build/
 runtime/
 logs/
+*.iml
+.idea
+.log
+

Modified: nutch/branches/2.x/conf/gora-accumulo-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-accumulo-mapping.xml?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/conf/gora-accumulo-mapping.xml (original)
+++ nutch/branches/2.x/conf/gora-accumulo-mapping.xml Tue Jan 26 19:19:02 2016
@@ -32,6 +32,7 @@
         <family name="s" />
         <family name="il" />
         <family name="ol" />
+        <family name="stm" />
         <family name="h" />
         <family name="mtdt" />
         <family name="mk" />
@@ -53,6 +54,7 @@
         <field name="modifiedTime" family="f" qualifier="mod"/>
         <field name="prevModifiedTime" family="f" qualifier="pmod"/>
         <field name="batchId" family="f" qualifier="bid"/>
+        <field name="sitemaps" family="stm"/>
         
         <!-- parse fields                                       -->
         <field name="title" family="p" qualifier="t"/>
@@ -63,11 +65,12 @@
         
         <!-- score fields                                       -->
         <field name="score" family="s" qualifier="s"/>
-        <field name="headers" family="h"/>
-        <field name="inlinks" family="il"/>
-        <field name="outlinks" family="ol"/>
-        <field name="metadata" family="mtdt"/>
-        <field name="markers" family="mk"/>
+        <field name="stmPriority" family="s" qualifier="sp"/>
+        <field name="headers" family="h" qualifier="hea"/>
+        <field name="inlinks" family="il" qualifier="inl"/>
+        <field name="outlinks" family="ol" qualifier="out"/>
+        <field name="metadata" family="mtdt" qualifier="met"/>
+        <field name="markers" family="mk" qualifier="mar"/>
     </class>
     
     <table name="host">
@@ -77,9 +80,9 @@
     </table>
     
     <class table="host" keyClass="java.lang.String" name="org.apache.nutch.storage.Host">
-        <field name="metadata" family="mtdt"/>
-        <field name="inlinks" family="il"/>
-        <field name="outlinks" family="ol"/>
+        <field name="metadata" family="mtdt" qualifier="met"/>
+        <field name="inlinks" family="il" qualifier="inl"/>
+        <field name="outlinks" family="ol" qualifier="out"/>
     </class>
     
 </gora-orm>

Modified: nutch/branches/2.x/conf/gora-cassandra-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-cassandra-mapping.xml?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/conf/gora-cassandra-mapping.xml (original)
+++ nutch/branches/2.x/conf/gora-cassandra-mapping.xml Tue Jan 26 19:19:02 2016
@@ -58,6 +58,7 @@
         <field name="modifiedTime" family="f" qualifier="mod" ttl="0"/>
         <field name="prevModifiedTime" family="f" qualifier="pmod" ttl="0"/>
         <field name="batchId" family="f" qualifier="bid" ttl="0"/>
+        <field name="sitemaps" family="f" qualifier="stm" ttl="0"/>
         
         <!-- parse fields -->
         <field name="title" family="p" qualifier="t" ttl="0"/>
@@ -67,6 +68,7 @@
         
         <!-- score fields -->
         <field name="score" family="f" qualifier="s" ttl="0"/>
+        <field name="stmPriority" family="f" qualifier="sp" ttl="0"/>
         
         <!-- super columns -->
         <field name="headers" family="sc" qualifier="h" ttl="0"/>

Modified: nutch/branches/2.x/conf/gora-hbase-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-hbase-mapping.xml?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/conf/gora-hbase-mapping.xml (original)
+++ nutch/branches/2.x/conf/gora-hbase-mapping.xml Tue Jan 26 19:19:02 2016
@@ -46,6 +46,7 @@ http://gora.apache.org/current/gora-hbas
         <family name="s" maxVersions="1"/>
         <family name="il" maxVersions="1"/>
         <family name="ol" maxVersions="1"/>
+        <family name="stm" maxVersions="1"/>
         <family name="h" maxVersions="1"/>
         <family name="mtdt" maxVersions="1"/>
         <family name="mk" maxVersions="1"/>
@@ -66,6 +67,8 @@ http://gora.apache.org/current/gora-hbas
         <field name="modifiedTime" family="f" qualifier="mod"/>
         <field name="prevModifiedTime" family="f" qualifier="pmod"/>
         <field name="batchId" family="f" qualifier="bid"/>
+        <field name="sitemaps" family="stm"/>
+
 
         <!-- parse fields                                       -->
         <field name="title" family="p" qualifier="t"/>
@@ -76,6 +79,8 @@ http://gora.apache.org/current/gora-hbas
         
         <!-- score fields                                       -->
         <field name="score" family="s" qualifier="s"/>
+        <field name="stmPriority" family="s" qualifier="sp"/>
+
         <field name="headers" family="h"/>
         <field name="inlinks" family="il"/>
         <field name="outlinks" family="ol"/>

Modified: nutch/branches/2.x/conf/gora-mongodb-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-mongodb-mapping.xml?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/conf/gora-mongodb-mapping.xml (original)
+++ nutch/branches/2.x/conf/gora-mongodb-mapping.xml Tue Jan 26 19:19:02 2016
@@ -35,6 +35,7 @@
         <field name="modifiedTime" docfield="modifiedTime" type="int64"/>
         <field name="prevModifiedTime" docfield="prevModifiedTime" type="int64"/>
         <field name="batchId" docfield="batchId" type="string"/>
+        <field name="sitemaps" docfield="sitemaps" type="document"/>
         
         <!-- parse fields -->
         <field name="title" docfield="title" type="string"/>
@@ -43,6 +44,7 @@
         <field name="prevSignature" docfield="prevSignature" type="string"/>
         <!-- score fields -->
         <field name="score" docfield="score" type="int32"/>
+        <field name="stmPriority" docfield="stmPriority" type="int32"/>
         <field name="headers" docfield="headers" type="document"/>
         <field name="inlinks" docfield="inlinks" type="document"/>
         <field name="outlinks" docfield="outlinks" type="document"/>

Modified: nutch/branches/2.x/conf/gora-solr-mapping.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/gora-solr-mapping.xml?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/conf/gora-solr-mapping.xml (original)
+++ nutch/branches/2.x/conf/gora-solr-mapping.xml Tue Jan 26 19:19:02 2016
@@ -35,6 +35,7 @@
         <field name="modifiedTime" column="modifiedTime"/>
         <field name="prevModifiedTime" column="prevModifiedTime" />
         <field name="batchId" column="batchId" />
+        <field name="sitemaps" column="sitemaps"/>
         
         <!-- parse fields -->
         <field name="title" column="title" />
@@ -43,6 +44,7 @@
         <field name="prevSignature" column="prevSignature"/>
         <!-- score fields -->
         <field name="score" column="score"/>
+        <field name="stmPriority" column="stmPriority"/>
         <field name="headers" column="headers"/>
         <field name="inlinks" column="inlinks" />
         <field name="outlinks" column="outlinks"/>

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Tue Jan 26 19:19:02 2016
@@ -216,6 +216,16 @@
 </property>
 
 <property>
+  <name>sitemap.content.limit</name>
+  <value>-1</value>
+  <description>The length limit for downloaded content using the http
+  protocol for sitemap, in bytes. If this value is nonnegative (>=0),
+  content longer than it will be truncated; otherwise, no truncation at all. Do not
+  confuse this setting with the file.content.limit setting.
+  </description>
+</property>
+
+<property>
   <name>http.proxy.host</name>
   <value></value>
   <description>The proxy hostname.  If empty, no proxy is used.</description>
@@ -1008,6 +1018,17 @@
   Set to -1 to deactivate, bearing in mind that this could cause
   the parsing to crash because of a very long or corrupted document.
   </description>
+</property>
+
+<property>
+  <name>sitemap.parser.timeout</name>
+  <value>30</value>
+  <description>Timeout in seconds for the parsing of a document, otherwise
+  treats it as an exception and moves on to the following documents. 
+  This parameter is applied to Sitemap Parser implementation.
+  Set to -1 to deactivate, bearing in mind that this could cause
+  the parsing to crash because of a very long or corrupted document.
+  </description>
 </property>
 
 <property>

Modified: nutch/branches/2.x/src/gora/webpage.avsc
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/gora/webpage.avsc?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/gora/webpage.avsc (original)
+++ nutch/branches/2.x/src/gora/webpage.avsc Tue Jan 26 19:19:02 2016
@@ -278,6 +278,26 @@
       ],
       "doc": "A batchId that this WebPage is assigned to. WebPage's are fetched in batches, called fetchlists. Pages are partitioned but can always be associated and fetched alongside pages of similar value (within a crawl cycle) based on batchId.",
       "default": null
+    },
+    {
+      "name": "sitemaps",
+      "type": {
+        "type": "map",
+        "values": [
+          "null",
+          "string"
+        ]
+      },
+      "doc": "Sitemap urls discovered in robots.txt",
+      "default": {
+
+      }
+    },
+    {
+      "name": "stmPriority",
+      "type": "float",
+      "doc": "Priority of this page taken from the sitemap entry",
+      "default": 0
     }
   ]
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java Tue Jan 26 19:19:02 2016
@@ -36,7 +36,7 @@ import org.apache.nutch.util.WebPageWrit
 import org.apache.gora.mapreduce.GoraMapper;
 
 public class DbUpdateMapper extends
-    GoraMapper<String, WebPage, UrlWithScore, NutchWritable> {
+GoraMapper<String, WebPage, UrlWithScore, NutchWritable> {
   public static final Logger LOG = DbUpdaterJob.LOG;
 
   private ScoringFilters scoringFilters;
@@ -57,7 +57,7 @@ public class DbUpdateMapper extends
     if (Mark.GENERATE_MARK.checkMark(page) == null) {
       if (LOG.isDebugEnabled()) {
         LOG.debug("Skipping " + TableUtil.unreverseUrl(key)
-            + "; not generated yet");
+        + "; not generated yet");
       }
       return;
     }
@@ -66,16 +66,10 @@ public class DbUpdateMapper extends
 
     scoreData.clear();
     Map<CharSequence, CharSequence> outlinks = page.getOutlinks();
-    if (outlinks != null) {
-      for (Entry<CharSequence, CharSequence> e : outlinks.entrySet()) {
-        int depth = Integer.MAX_VALUE;
-        CharSequence depthUtf8 = page.getMarkers().get(DbUpdaterJob.DISTANCE);
-        if (depthUtf8 != null)
-          depth = Integer.parseInt(depthUtf8.toString());
-        scoreData.add(new ScoreDatum(0.0f, e.getKey().toString(), e.getValue()
-            .toString(), depth));
-      }
-    }
+    addScoreData(page, outlinks);
+
+    Map<CharSequence, CharSequence> sitemaps = page.getSitemaps();
+    addScoreData(page, sitemaps);
 
     // TODO: Outlink filtering (i.e. "only keep the first n outlinks")
     try {
@@ -102,6 +96,19 @@ public class DbUpdateMapper extends
     }
   }
 
+  private void addScoreData(WebPage page, Map<CharSequence, CharSequence> map) {
+    if (map != null) {
+      for (Entry<CharSequence, CharSequence> e : map.entrySet()) {
+        int depth = Integer.MAX_VALUE;
+        CharSequence depthUtf8 = page.getMarkers().get(DbUpdaterJob.DISTANCE);
+        if (depthUtf8 != null)
+          depth = Integer.parseInt(depthUtf8.toString());
+        scoreData.add(new ScoreDatum(0.0f, e.getKey().toString(), e.getValue()
+            .toString(), depth));
+      }
+    }
+  }
+
   @Override
   public void setup(Context context) {
     scoringFilters = new ScoringFilters(context.getConfiguration());

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdaterJob.java Tue Jan 26 19:19:02 2016
@@ -64,6 +64,7 @@ public class DbUpdaterJob extends NutchT
     FIELDS.add(WebPage.Field.PREV_FETCH_TIME);
     FIELDS.add(WebPage.Field.PREV_MODIFIED_TIME);
     FIELDS.add(WebPage.Field.HEADERS);
+    FIELDS.add(WebPage.Field.SITEMAPS);
   }
 
   public static final Utf8 DISTANCE = new Utf8("dist");

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Tue Jan 26 19:19:02 2016
@@ -51,6 +51,7 @@ public class GeneratorJob extends NutchT
   public static final String GENERATOR_MIN_SCORE = "generate.min.score";
   public static final String GENERATOR_FILTER = "generate.filter";
   public static final String GENERATOR_NORMALISE = "generate.normalise";
+  public static final String GENERATOR_SITEMAP = "generate.sitemap";
   public static final String GENERATOR_MAX_COUNT = "generate.max.count";
   public static final String GENERATOR_COUNT_MODE = "generate.count.mode";
   public static final String GENERATOR_COUNT_VALUE_DOMAIN = "domain";
@@ -75,7 +76,7 @@ public class GeneratorJob extends NutchT
   public static final Logger LOG = LoggerFactory.getLogger(GeneratorJob.class);
 
   public static class SelectorEntry implements
-      WritableComparable<SelectorEntry> {
+  WritableComparable<SelectorEntry> {
 
     String url;
     float score;
@@ -170,7 +171,7 @@ public class GeneratorJob extends NutchT
     String batchId = (curTime / 1000) + "-" + randomSeed;
     return batchId;
   }
-  
+
   public Map<String, Object> run(Map<String, Object> args) throws Exception {
     String batchId = (String) args.get(Nutch.ARG_BATCH);
     if (batchId == null) {
@@ -191,12 +192,16 @@ public class GeneratorJob extends NutchT
     }
     Boolean filter = (Boolean) args.get(Nutch.ARG_FILTER);
     Boolean norm = (Boolean) args.get(Nutch.ARG_NORMALIZE);
+    Boolean sitemap = (Boolean) args.get(Nutch.ARG_SITEMAP);
+
     // map to inverted subset due for fetch, sort by score
     getConf().setLong(GENERATOR_CUR_TIME, curTime);
     if (topN != null)
       getConf().setLong(GENERATOR_TOP_N, topN);
     if (filter != null)
       getConf().setBoolean(GENERATOR_FILTER, filter);
+    if (sitemap != null)
+      getConf().setBoolean(GENERATOR_SITEMAP, sitemap);
 
     getConf().setLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis());
     if (norm != null)
@@ -239,23 +244,26 @@ public class GeneratorJob extends NutchT
    * @throws ClassNotFoundException
    * @throws InterruptedException
    * */
-  public String generate(long topN, long curTime, boolean filter, boolean norm)
-      throws Exception {
+  public String generate(long topN, long curTime, boolean filter, boolean norm,
+      boolean sitemap) throws Exception {
 
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
-    LOG.info("GeneratorJob: starting at " + sdf.format(start));
+    LOG.info("GeneratorJob: starting at {}", sdf.format(start));
     LOG.info("GeneratorJob: Selecting best-scoring urls due for fetch.");
     LOG.info("GeneratorJob: starting");
-    LOG.info("GeneratorJob: filtering: " + filter);
-    LOG.info("GeneratorJob: normalizing: " + norm);
+    LOG.info("GeneratorJob: filtering: {}", filter);
+    LOG.info("GeneratorJob: normalizing: {}", norm);
+    if (sitemap) {
+      LOG.info("GeneratorJob: sitemap: {}", sitemap);
+    }
     if (topN != Long.MAX_VALUE) {
-      LOG.info("GeneratorJob: topN: " + topN);
+      LOG.info("GeneratorJob: topN: {}", topN);
     }
     String batchId = getConf().get(BATCH_ID);
     Map<String, Object> results = run(ToolUtil.toArgMap(Nutch.ARG_TOPN, topN,
         Nutch.ARG_CURTIME, curTime, Nutch.ARG_FILTER, filter,
-        Nutch.ARG_NORMALIZE, norm, Nutch.ARG_BATCH, batchId));
+        Nutch.ARG_NORMALIZE, norm, Nutch.ARG_BATCH, batchId, Nutch.ARG_SITEMAP, sitemap));
     if (batchId == null) {
       // use generated random batch id
       batchId = (String) results.get(BATCH_ID);
@@ -263,10 +271,10 @@ public class GeneratorJob extends NutchT
 
     long finish = System.currentTimeMillis();
     long generateCount = (Long) results.get(GENERATE_COUNT);
-    LOG.info("GeneratorJob: finished at " + sdf.format(finish)
-        + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
-    LOG.info("GeneratorJob: generated batch id: " + batchId + " containing "
-        + generateCount + " URLs");
+    LOG.info("GeneratorJob: finished at {}, time elapsed: {}", 
+        sdf.format(finish), TimingUtil.elapsedTime(start, finish));
+    LOG.info("GeneratorJob: generated batch id: {} containing {} URLs", 
+        batchId, generateCount);
     if (generateCount == 0) {
       return null;
     }
@@ -276,19 +284,21 @@ public class GeneratorJob extends NutchT
   public int run(String[] args) throws Exception {
     if (args.length <= 0) {
       System.out
-          .println("Usage: GeneratorJob [-topN N] [-crawlId id] [-noFilter] [-noNorm] [-adddays numDays]");
+      .println("Usage: GeneratorJob [-topN N] [-crawlId id] [-noFilter] [-noNorm] [-adddays numDays] [-sitemap]");
+      System.out
+      .println("    -topN <N>      - number of top URLs to be selected, default is Long.MAX_VALUE ");
       System.out
-          .println("    -topN <N>      - number of top URLs to be selected, default is Long.MAX_VALUE ");
+      .println("    -crawlId <id>  - the id to prefix the schemas to operate on, \n \t \t    (default: storage.crawl.id)\");");
       System.out
-          .println("    -crawlId <id>  - the id to prefix the schemas to operate on, \n \t \t    (default: storage.crawl.id)\");");
+      .println("    -noFilter      - do not activate the filter plugin to filter the url, default is true ");
       System.out
-          .println("    -noFilter      - do not activate the filter plugin to filter the url, default is true ");
+      .println("    -noNorm        - do not activate the normalizer plugin to normalize the url, default is true ");
       System.out
-          .println("    -noNorm        - do not activate the normalizer plugin to normalize the url, default is true ");
+      .println("    -adddays       - Adds numDays to the current time to facilitate crawling urls already");
       System.out
-          .println("    -adddays       - Adds numDays to the current time to facilitate crawling urls already");
+      .println("    -sitemap       - generate only sitemap url, default false");
       System.out
-          .println("                     fetched sooner then db.fetch.interval.default. Default value is 0.");
+      .println("                     fetched sooner then db.fetch.interval.default. Default value is 0.");
       System.out.println("    -batchId       - the batch id ");
       System.out.println("----------------------");
       System.out.println("Please set the params.");
@@ -297,6 +307,7 @@ public class GeneratorJob extends NutchT
 
     long curTime = System.currentTimeMillis(), topN = Long.MAX_VALUE;
     boolean filter = true, norm = true;
+    boolean sitemap = false;
 
     for (int i = 0; i < args.length; i++) {
       if ("-topN".equals(args[i])) {
@@ -307,6 +318,8 @@ public class GeneratorJob extends NutchT
         norm = false;
       } else if ("-crawlId".equals(args[i])) {
         getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
+      } else if ("-sitemap".equals(args[i])) {
+        sitemap = true;
       } else if ("-adddays".equals(args[i])) {
         long numDays = Integer.parseInt(args[++i]);
         curTime += numDays * 1000L * 60 * 60 * 24;
@@ -319,7 +332,7 @@ public class GeneratorJob extends NutchT
     }
 
     try {
-      return (generate(topN, curTime, filter, norm) != null) ? 0 : 1;
+      return (generate(topN, curTime, filter, norm, sitemap) != null) ? 0 : 1;
     } catch (Exception e) {
       LOG.error("GeneratorJob: " + StringUtils.stringifyException(e));
       return -1;

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java Tue Jan 26 19:19:02 2016
@@ -32,12 +32,13 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 
 public class GeneratorMapper extends
-    GoraMapper<String, WebPage, SelectorEntry, WebPage> {
+GoraMapper<String, WebPage, SelectorEntry, WebPage> {
 
   private URLFilters filters;
   private URLNormalizers normalizers;
   private boolean filter;
   private boolean normalise;
+  private boolean sitemap;
   private FetchSchedule schedule;
   private ScoringFilters scoringFilters;
   private long curTime;
@@ -73,13 +74,16 @@ public class GeneratorMapper extends
       }
       if (filter && filters.filter(url) == null)
         return;
+      if ((sitemap && !URLFilters.isSitemap(page)) || !sitemap && URLFilters
+          .isSitemap(page))
+        return;
     } catch (URLFilterException e) {
       GeneratorJob.LOG
-          .warn("Couldn't filter url: {} ({})", url, e.getMessage());
+      .warn("Couldn't filter url: {} ({})", url, e.getMessage());
       return;
     } catch (MalformedURLException e) {
       GeneratorJob.LOG
-          .warn("Couldn't filter url: {} ({})", url, e.getMessage());
+      .warn("Couldn't filter url: {} ({})", url, e.getMessage());
       return;
     }
 
@@ -106,6 +110,7 @@ public class GeneratorMapper extends
     Configuration conf = context.getConfiguration();
     filter = conf.getBoolean(GeneratorJob.GENERATOR_FILTER, true);
     normalise = conf.getBoolean(GeneratorJob.GENERATOR_NORMALISE, true);
+    sitemap = conf.getBoolean(GeneratorJob.GENERATOR_SITEMAP, false);
     if (filter) {
       filters = new URLFilters(conf);
     }

Added: nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectType.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectType.java?rev=1726853&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectType.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectType.java Tue Jan 26 19:19:02 2016
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.crawl;
+
+import org.apache.avro.util.Utf8;
+
+public enum InjectType {
+  INJECT("y"),
+  SITEMAP_INJECT("s");
+
+  Utf8 type;
+
+  private InjectType(String type) {
+    this.type = new Utf8(type);
+  }
+
+  public Utf8 getTypeString() {
+    return new Utf8(type);
+  }
+
+}
\ No newline at end of file

Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java Tue Jan 26 19:19:02 2016
@@ -65,8 +65,6 @@ public class InjectorJob extends NutchTo
 
   private static final Set<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
-  private static final Utf8 YES_STRING = new Utf8("y");
-
   static {
     FIELDS.add(WebPage.Field.MARKERS);
     FIELDS.add(WebPage.Field.STATUS);
@@ -79,8 +77,7 @@ public class InjectorJob extends NutchTo
    */
   public static String nutchFetchIntervalMDName = "nutch.fetchInterval";
 
-  public static class UrlMapper extends
-      Mapper<LongWritable, Text, String, WebPage> {
+  public static class UrlMapper extends Mapper<LongWritable, Text, String, WebPage> {
     private URLNormalizers urlNormalizers;
     private int interval;
     private float scoreInjected;
@@ -90,7 +87,7 @@ public class InjectorJob extends NutchTo
 
     @Override
     protected void setup(Context context) throws IOException,
-        InterruptedException {
+    InterruptedException {
       urlNormalizers = new URLNormalizers(context.getConfiguration(),
           URLNormalizers.SCOPE_INJECT);
       interval = context.getConfiguration().getInt("db.fetch.interval.default",
@@ -117,13 +114,26 @@ public class InjectorJob extends NutchTo
       float customScore = -1f;
       int customInterval = interval;
       Map<String, String> metadata = new TreeMap<String, String>();
+      InjectType injectType = InjectType.INJECT;
       if (url.indexOf("\t") != -1) {
         String[] splits = url.split("\t");
         url = splits[0];
         for (int s = 1; s < splits.length; s++) {
           // find separation between name and value
           int indexEquals = splits[s].indexOf("=");
-          if (indexEquals == -1) {
+          if (splits[s].indexOf("sitemaps:") > -1) {
+            String[] sitemaps = splits[s].trim().split(" ");
+            String sitemapUrl;
+            for (int i = 1; i < sitemaps.length; i++) {
+              sitemapUrl = url + sitemaps[i];
+              write(sitemapUrl, context, customInterval, customScore,
+                  new HashMap<String, String>(), InjectType.SITEMAP_INJECT);
+            }
+            continue;
+          } else if (splits[s].indexOf("-sitemap") == 0) {
+            injectType = InjectType.SITEMAP_INJECT;
+            continue;
+          } else if (indexEquals == -1) {
             // skip anything without a =
             continue;
           }
@@ -143,6 +153,12 @@ public class InjectorJob extends NutchTo
             metadata.put(metaname, metavalue);
         }
       }
+      write(url, context, customInterval, customScore, metadata, injectType);
+    }
+
+    private void write(String url, Context context, Integer customInterval,
+        Float customScore, Map<String, String> metadata, InjectType injectType)
+            throws IOException, InterruptedException {
       try {
         url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
         url = filters.filter(url); // filter the url
@@ -177,14 +193,13 @@ public class InjectorJob extends NutchTo
           scfilters.injectedScore(url, row);
         } catch (ScoringFilterException e) {
           if (LOG.isWarnEnabled()) {
-            LOG.warn("Cannot filter injected score for url " + url
-                + ", using default (" + e.getMessage() + ")");
+            LOG.warn("Cannot filter injected score for url {}, using default ({})", url, e.getMessage());
           }
         }
         context.getCounter("injector", "urls_injected").increment(1);
         row.getMarkers()
-            .put(DbUpdaterJob.DISTANCE, new Utf8(String.valueOf(0)));
-        Mark.INJECT_MARK.putMark(row, YES_STRING);
+        .put(DbUpdaterJob.DISTANCE, new Utf8(String.valueOf(0)));
+        Mark.INJECT_MARK.putMark(row, injectType.getTypeString());
         context.write(reversedUrl, row);
       }
     }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java Tue Jan 26 19:19:02 2016
@@ -35,6 +35,7 @@ import org.apache.hadoop.util.ToolRunner
 import org.apache.nutch.crawl.GeneratorJob;
 import org.apache.nutch.crawl.URLPartitioner.FetchEntryPartitioner;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.parse.ParserJob;
 import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.storage.Mark;
@@ -61,6 +62,8 @@ public class FetcherJob extends NutchToo
   public static final Utf8 REDIRECT_DISCOVERED = new Utf8("___rdrdsc__");
 
   public static final String RESUME_KEY = "fetcher.job.resume";
+  public static final String SITEMAP = "fetcher.job.sitemap";
+  public static final String SITEMAP_DETECT = "fetcher.job.sitemap.detect";
   public static final String PARSE_KEY = "fetcher.parse";
   public static final String THREADS_KEY = "fetcher.threads.fetch";
 
@@ -90,7 +93,7 @@ public class FetcherJob extends NutchToo
    * </p>
    */
   public static class FetcherMapper extends
-      GoraMapper<String, WebPage, IntWritable, FetchEntry> {
+  GoraMapper<String, WebPage, IntWritable, FetchEntry> {
 
     private boolean shouldContinue;
 
@@ -112,17 +115,22 @@ public class FetcherJob extends NutchToo
       if (Mark.GENERATE_MARK.checkMark(page) == null) {
         if (LOG.isDebugEnabled()) {
           LOG.debug("Skipping " + TableUtil.unreverseUrl(key)
-              + "; not generated yet");
+          + "; not generated yet");
         }
         return;
       }
       if (shouldContinue && Mark.FETCH_MARK.checkMark(page) != null) {
         if (LOG.isDebugEnabled()) {
           LOG.debug("Skipping " + TableUtil.unreverseUrl(key)
-              + "; already fetched");
+          + "; already fetched");
         }
         return;
       }
+      boolean sitemap = context.getConfiguration().getBoolean(SITEMAP, false);
+
+      if ((sitemap && !URLFilters.isSitemap(page)) || !sitemap && URLFilters
+          .isSitemap(page))
+        return;
       context.write(new IntWritable(random.nextInt(65536)), new FetchEntry(
           context.getConfiguration(), key, page));
     }
@@ -158,6 +166,8 @@ public class FetcherJob extends NutchToo
     Integer threads = (Integer) args.get(Nutch.ARG_THREADS);
     Boolean shouldResume = (Boolean) args.get(Nutch.ARG_RESUME);
     Integer numTasks = (Integer) args.get(Nutch.ARG_NUMTASKS);
+    Boolean stmDetect = (Boolean) args.get(Nutch.ARG_SITEMAP_DETECT);
+    Boolean sitemap = (Boolean) args.get(Nutch.ARG_SITEMAP);
 
     if (threads != null && threads > 0) {
       getConf().setInt(THREADS_KEY, threads);
@@ -169,10 +179,16 @@ public class FetcherJob extends NutchToo
     if (shouldResume != null) {
       getConf().setBoolean(RESUME_KEY, shouldResume);
     }
+    if (stmDetect != null) {
+      getConf().setBoolean(SITEMAP_DETECT, stmDetect);
+    }
+    if (sitemap != null) {
+      getConf().setBoolean(SITEMAP, sitemap);
+    }
 
-    LOG.info("FetcherJob: threads: " + getConf().getInt(THREADS_KEY, 10));
-    LOG.info("FetcherJob: parsing: " + getConf().getBoolean(PARSE_KEY, false));
-    LOG.info("FetcherJob: resuming: " + getConf().getBoolean(RESUME_KEY, false));
+    LOG.info("FetcherJob: threads: {}", getConf().getInt(THREADS_KEY, 10));
+    LOG.info("FetcherJob: parsing: {}", getConf().getBoolean(PARSE_KEY, false));
+    LOG.info("FetcherJob: resuming: {}", getConf().getBoolean(RESUME_KEY, false));
 
     // set the actual time for the timelimit relative
     // to the beginning of the whole job and not of a specific task
@@ -182,8 +198,7 @@ public class FetcherJob extends NutchToo
       timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
       getConf().setLong("fetcher.timelimit", timelimit);
     }
-    LOG.info("FetcherJob : timelimit set for : "
-        + getConf().getLong("fetcher.timelimit", -1));
+    LOG.info("FetcherJob : timelimit set for : {}", getConf().getLong("fetcher.timelimit", -1));
     numJobs = 1;
     currentJob = NutchJob.getInstance(getConf(), "fetch");
 
@@ -237,6 +252,31 @@ public class FetcherJob extends NutchToo
    */
   public int fetch(String batchId, int threads, boolean shouldResume,
       int numTasks) throws Exception {
+    return fetch(batchId, threads, shouldResume, numTasks, false, false);
+  }
+
+  /**
+   * Run fetcher.
+   *
+   * @param batchId
+   *          batchId (obtained from Generator) or null to fetch all generated
+   *          fetchlists
+   * @param threads
+   *          number of threads per map task
+   * @param shouldResume
+   * @param numTasks
+   *          number of fetching tasks (reducers). If set to < 1 then use the
+   *          default, which is mapred.map.tasks.
+   * @param stmDetect
+   *          If set true, sitemap detection is run.
+   * @param sitemap
+   *          If set true, only sitemap files are fetched; if set false, only
+   *          normal URLs are fetched.
+   * @return 0 on success
+   * @throws Exception
+   */
+  public int fetch(String batchId, int threads, boolean shouldResume,
+      int numTasks, boolean stmDetect, boolean sitemap) throws Exception {
 
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
@@ -249,11 +289,12 @@ public class FetcherJob extends NutchToo
     }
 
     run(ToolUtil.toArgMap(Nutch.ARG_BATCH, batchId, Nutch.ARG_THREADS, threads,
-        Nutch.ARG_RESUME, shouldResume, Nutch.ARG_NUMTASKS, numTasks));
+        Nutch.ARG_RESUME, shouldResume, Nutch.ARG_NUMTASKS, numTasks,
+        Nutch.ARG_SITEMAP_DETECT, stmDetect, Nutch.ARG_SITEMAP, sitemap));
 
     long finish = System.currentTimeMillis();
     LOG.info("FetcherJob: finished at " + sdf.format(finish)
-        + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
+    + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
 
     return 0;
   }
@@ -275,6 +316,7 @@ public class FetcherJob extends NutchToo
   public int run(String[] args) throws Exception {
     int threads = -1;
     boolean shouldResume = false;
+    boolean stmRobot = false, sitemap = false;
     String batchId;
 
     String usage = "Usage: FetcherJob (<batchId> | -all) [-crawlId <id>] "
@@ -283,7 +325,9 @@ public class FetcherJob extends NutchToo
         + "    -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t    (default: storage.crawl.id)\n"
         + "    -threads N    - number of fetching threads per task\n"
         + "    -resume       - resume interrupted job\n"
-        + "    -numTasks N   - if N > 0 then use this many reduce tasks for fetching \n \t \t    (default: mapred.map.tasks)";
+        + "    -numTasks N   - if N > 0 then use this many reduce tasks for fetching \n \t \t    (default: mapred.map.tasks)\n"
+        + "    -sitemap      - only sitemap files are fetched, defaults to false\n"
+        + "    -stmDetect    - sitemap files are detected from the robots.txt file";
 
     if (args.length == 0) {
       System.err.println(usage);
@@ -306,13 +350,17 @@ public class FetcherJob extends NutchToo
         numTasks = Integer.parseInt(args[++i]);
       } else if ("-crawlId".equals(args[i])) {
         getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
+      } else if ("-sitemap".equals(args[i])) {
+        sitemap = true;
+      } else if ("-stmDetect".equals(args[i])) {
+        stmRobot = true;
       } else {
         throw new IllegalArgumentException("arg " + args[i] + " not recognized");
       }
     }
 
-    int fetchcode = fetch(batchId, threads, shouldResume, numTasks); // run the
-                                                                     // Fetcher
+    int fetchcode = fetch(batchId, threads, shouldResume, numTasks, stmRobot,
+        sitemap); // run the Fetcher
 
     return fetchcode;
   }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java Tue Jan 26 19:19:02 2016
@@ -491,7 +491,8 @@ public class FetcherReducer extends
           }
           try {
             LOG.info("fetching " + fit.url + " (queue crawl delay="
-                + fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay + "ms)");
+                + fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay
+                + "ms)");
 
             // fetch the page
             final Protocol protocol = this.protocolFactory.getProtocol(fit.url);
@@ -527,6 +528,19 @@ public class FetcherReducer extends
                 }
               }
             }
+
+            boolean stmRobot = context.getConfiguration().getBoolean(FetcherJob.SITEMAP_DETECT, false);
+
+            if (stmRobot && (fit.u.getFile() == null
+                || fit.u.getFile().length() == 0 || (
+                fit.u.getFile().length() == 1 && fit.u.getFile().equals(
+                    "/")))) {
+              for (String stmUrl : rules.getSitemaps()) {
+                fit.page.getSitemaps()
+                    .put(new Utf8(stmUrl), new Utf8());
+              }
+            }
+
             final ProtocolOutput output = protocol.getProtocolOutput(fit.url,
                 fit.page);
             final ProtocolStatus status = output.getStatus();
@@ -806,7 +820,13 @@ public class FetcherReducer extends
     parse = conf.getBoolean(FetcherJob.PARSE_KEY, false);
     storingContent = conf.getBoolean("fetcher.store.content", true);
     if (parse) {
-      skipTruncated = conf.getBoolean(ParserJob.SKIP_TRUNCATED, true);
+      boolean sitemap = conf.getBoolean(FetcherJob.SITEMAP, false);
+
+      if (sitemap) {
+        skipTruncated = false;
+      } else {
+        skipTruncated = conf.getBoolean(ParserJob.SKIP_TRUNCATED, true);
+      }
       parseUtil = new ParseUtil(conf);
     }
     LOG.info("Fetcher: threads: " + threadCount);

Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/Metadata.java Tue Jan 26 19:19:02 2016
@@ -19,10 +19,7 @@ package org.apache.nutch.metadata;
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
-import java.util.Enumeration;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Properties;
+import java.util.*;
 
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
@@ -97,6 +94,15 @@ public class Metadata implements Writabl
     return _getValues(name);
   }
 
+  /**
+   * Get all metadata entries.
+   *
+   * @return a set view of all metadata name/value pairs.
+   */
+  public Set<Map.Entry<String, String[]>> getMetaData() {
+    return metadata.entrySet();
+  }
+
   private String[] _getValues(final String name) {
     String[] values = metadata.get(name);
     if (values == null) {

Modified: nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/metadata/Nutch.java Tue Jan 26 19:19:02 2016
@@ -112,6 +112,10 @@ public interface Nutch {
   public static final String ARG_CLASS = "class";
   /** Depth (number of cycles) of a crawl. */
   public static final String ARG_DEPTH = "depth";
+  /** Sitemaps. */
+  public static final String ARG_SITEMAP = "sitemap";
+  /** Whether sitemap detection from robots.txt is run during fetch. */
+  public static final String ARG_SITEMAP_DETECT = "stmDetect";
 
   // short constants for status / results fields
   /** Status / result message. */

Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/net/URLFilters.java Tue Jan 26 19:19:02 2016
@@ -21,10 +21,13 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Map;
 
+import org.apache.nutch.crawl.InjectType;
 import org.apache.nutch.plugin.Extension;
 import org.apache.nutch.plugin.ExtensionPoint;
 import org.apache.nutch.plugin.PluginRuntimeException;
 import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.storage.Mark;
+import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.ObjectCache;
 
 import org.apache.hadoop.conf.Configuration;
@@ -93,4 +96,17 @@ public class URLFilters {
     }
     return urlString;
   }
+
+  /**
+   * Returns true if the page was injected as a sitemap (i.e. carries the
+   * sitemap inject mark).
+   */
+  public static boolean isSitemap(WebPage page) {
+    if (InjectType.SITEMAP_INJECT.getTypeString().equals(
+        Mark.INJECT_MARK.checkMark(page))) {
+      return true;
+    } else {
+      return false;
+    }
+  }
 }

Added: nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParse.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParse.java?rev=1726853&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParse.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParse.java Tue Jan 26 19:19:02 2016
@@ -0,0 +1,55 @@
+/**
+ * ****************************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * <p/>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p/>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ****************************************************************************
+ */
+package org.apache.nutch.parse;
+
+import org.apache.nutch.metadata.Metadata;
+
+import java.util.List;
+import java.util.Map;
+
+public class NutchSitemapParse {
+
+  private Map<Outlink, Metadata> outlinkMap;
+  private org.apache.nutch.storage.ParseStatus parseStatus;
+
+  public NutchSitemapParse() {
+  }
+
+  public NutchSitemapParse(Map<Outlink, Metadata> outlinkMap,
+      org.apache.nutch.storage.ParseStatus parseStatus) {
+    this.outlinkMap = outlinkMap;
+    this.parseStatus = parseStatus;
+  }
+
+  public Map<Outlink, Metadata> getOutlinkMap() {
+    return outlinkMap;
+  }
+
+  public org.apache.nutch.storage.ParseStatus getParseStatus() {
+    return parseStatus;
+  }
+
+  public void setOutlinks(Map<Outlink, Metadata> outlinkMap) {
+    this.outlinkMap = outlinkMap;
+  }
+
+  public void setParseStatus(org.apache.nutch.storage.ParseStatus parseStatus) {
+    this.parseStatus = parseStatus;
+  }
+}

Added: nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParser.java?rev=1726853&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParser.java (added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/NutchSitemapParser.java Tue Jan 26 19:19:02 2016
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.*;
+
+import crawlercommons.sitemaps.*;
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+
+import org.apache.nutch.storage.ParseStatus;
+import org.apache.nutch.storage.WebPage;
+
+public class NutchSitemapParser {
+
+  private Configuration conf;
+
+  private static Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
+
+  static {
+    FIELDS.add(WebPage.Field.BASE_URL);
+  }
+
+  public NutchSitemapParse getParse(String url, WebPage page) {
+    NutchSitemapParse nutchSitemapParse = null;
+    SiteMapParser parser = new SiteMapParser();
+
+    AbstractSiteMap siteMap = null;
+    String contentType = page.getContentType().toString();
+    try {
+      siteMap = parser
+          .parseSiteMap(contentType, page.getContent().array(),
+              new URL(url));
+    } catch (UnknownFormatException e) {
+      e.printStackTrace();
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+    Map<Outlink, Metadata> outlinkMap = null;
+    Iterator i$;
+    if (siteMap.isIndex()) {
+      Collection<AbstractSiteMap> links = ((SiteMapIndex) siteMap)
+          .getSitemaps();
+      for (AbstractSiteMap siteMapIndex : links) {
+        page.getSitemaps().put(new Utf8(siteMapIndex.getUrl().toString()),
+            new Utf8("parser"));
+      }
+
+    } else {
+      Collection<SiteMapURL> links = ((SiteMap) siteMap).getSiteMapUrls();
+      outlinkMap = new HashMap<Outlink, Metadata>();
+
+      for (SiteMapURL sitemapUrl : links) {
+        Metadata metadata = new Metadata();
+        metadata
+            .add("changeFrequency", sitemapUrl.getChangeFrequency().name());
+        metadata.add("lastModified", Long.toString(
+            sitemapUrl.getLastModified().getTime()));
+        metadata.add("priority", Double.toString(sitemapUrl.getPriority()));
+        try {
+          outlinkMap.put(
+              new Outlink(sitemapUrl.getUrl().toString(), "sitemap.outlink"),
+              metadata);
+        } catch (MalformedURLException e) {
+          e.printStackTrace();
+        }
+      }
+    }
+    ParseStatus status = ParseStatus.newBuilder().build();
+    status.setMajorCode((int) ParseStatusCodes.SUCCESS);
+    nutchSitemapParse = new NutchSitemapParse(outlinkMap, status);
+    return nutchSitemapParse;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public Collection<WebPage.Field> getFields() {
+    return FIELDS;
+  }
+}

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Tue Jan 26 19:19:02 2016
@@ -22,24 +22,31 @@ import com.google.common.util.concurrent
 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.nutch.crawl.CrawlStatus;
+import org.apache.nutch.crawl.InjectType;
 import org.apache.nutch.crawl.Signature;
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.fetcher.FetcherJob;
+import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.URLFilterException;
 import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.net.URLNormalizers;
 import org.apache.nutch.storage.Mark;
+import org.apache.nutch.storage.ParseStatus;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.TableUtil;
 import org.apache.nutch.util.URLUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.ByteBuffer;
+import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
@@ -56,6 +63,9 @@ import java.util.concurrent.TimeUnit;
  */
 public class ParseUtil extends Configured {
 
+  public enum ChangeFrequency {
+    ALWAYS, HOURLY, DAILY, WEEKLY, MONTHLY, YEARLY, NEVER
+  }
   /* our log stream */
   public static final Logger LOG = LoggerFactory.getLogger(ParseUtil.class);
 
@@ -90,7 +100,12 @@ public class ParseUtil extends Configure
   public void setConf(Configuration conf) {
     this.conf = conf;
     parserFactory = new ParserFactory(conf);
-    maxParseTime = conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME);
+    if (conf.getBoolean("parse.sitemap", false)) {
+      maxParseTime = conf.getInt("parser.timeout", DEFAULT_MAX_PARSE_TIME);
+    } else {
+      maxParseTime = conf
+          .getInt("sitemap.parser.timeout", DEFAULT_MAX_PARSE_TIME);
+    }
     sig = SignatureFactory.getSignature(conf);
     filters = new URLFilters(conf);
     normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
@@ -113,25 +128,15 @@ public class ParseUtil extends Configure
    * @throws ParseException
    *           If there is an error parsing.
    */
-  public Parse parse(String url, WebPage page) throws ParserNotFound,
-      ParseException {
+  public Parse parse(String url, WebPage page) throws ParseException {
     Parser[] parsers = null;
+    Parse parse = null;
 
     String contentType = TableUtil.toString(page.getContentType());
-
     parsers = this.parserFactory.getParsers(contentType, url);
 
     for (int i = 0; i < parsers.length; i++) {
-      if (LOG.isDebugEnabled()) {
-        LOG.debug("Parsing [" + url + "] with [" + parsers[i] + "]");
-      }
-      Parse parse = null;
-
-      if (maxParseTime != -1)
-        parse = runParser(parsers[i], url, page);
-      else
-        parse = parsers[i].getParse(url, page);
-
+      parse = parse(url, page, parsers[i]);
       if (parse != null && ParseStatusUtils.isSuccess(parse.getParseStatus())) {
         return parse;
       }
@@ -143,6 +148,17 @@ public class ParseUtil extends Configure
         "Unable to successfully parse content"), null);
   }
 
+  private Parse parse(String url, WebPage page, Parser parser) {
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Parsing [" + url + "] with [" + parser + "]");
+    }
+    if (maxParseTime != -1) {
+      return runParser(parser, url, page);
+    } else {
+      return parser.getParse(url, page);
+    }
+  }
+
   private Parse runParser(Parser p, String url, WebPage page) {
     ParseCallable pc = new ParseCallable(p, page, url);
     Future<Parse> task = executorService.submit(pc);
@@ -158,24 +174,157 @@ public class ParseUtil extends Configure
     return res;
   }
 
-  /**
-   * Parses given web page and stores parsed content within page. Puts a
-   * meta-redirect to outlinks.
-   * 
-   * @param key
-   * @param page
-   */
-  public void process(String key, WebPage page) {
-    String url = TableUtil.unreverseUrl(key);
+  public boolean status(String url, WebPage page) {
     byte status = page.getStatus().byteValue();
     if (status != CrawlStatus.STATUS_FETCHED) {
       if (LOG.isDebugEnabled()) {
         LOG.debug("Skipping " + url + " as status is: "
             + CrawlStatus.getName(status));
       }
+      return true;
+    }
+    return false;
+  }
+
+  /**
+   * Parses given sitemap page and stores parsed content within page.
+   *
+   */
+  public void processSitemapParse(String url, WebPage page,
+      Mapper.Context context) {
+    if (status(url, page)) {
       return;
     }
 
+    NutchSitemapParser sParser = new NutchSitemapParser();
+    NutchSitemapParse nutchSitemapParse = sParser.getParse(url, page);
+
+    if (nutchSitemapParse == null) {
+      return;
+    }
+
+    ParseStatus pstatus = nutchSitemapParse.getParseStatus();
+    page.setParseStatus(pstatus);
+    if (ParseStatusUtils.isSuccess(pstatus)) {
+      final Map<Outlink, Metadata> outlinkMap = nutchSitemapParse
+          .getOutlinkMap();
+      if (pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) {
+        successRedirect(url, page, pstatus);
+      } else if (outlinkMap != null) {
+        Set<Outlink> outlinks = outlinkMap.keySet();
+        setSignature(page);
+
+        for (Outlink outlink : outlinks) {
+          String toUrl = outlink.getToUrl();
+
+          try {
+            toUrl = normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
+            toUrl = filters.filter(toUrl);
+          } catch (MalformedURLException e2) {
+            return;
+          } catch (URLFilterException e) {
+            return;
+          }
+          if (toUrl == null) {
+            return;
+          }
+          String reversedUrl = null;
+          try {
+            reversedUrl = TableUtil.reverseUrl(toUrl); // collect it
+          } catch (MalformedURLException e) {
+            e.printStackTrace();
+          }
+          WebPage newRow = WebPage.newBuilder().build();
+          Set<Map.Entry<String, String[]>> metaDatas = outlinkMap.get(outlink)
+              .getMetaData();
+          for (Map.Entry<String, String[]> metadata : metaDatas) {
+            System.out.println();
+            newRow.getMetadata().put(new Utf8(metadata.getKey()),
+                ByteBuffer.wrap(metadata.getValue()[0].getBytes()));
+          }
+
+          int changeFrequency = calculateFetchInterval(
+              outlinkMap.get(outlink).get("changeFrequency"));
+          String modifiedTime = outlinkMap.get(outlink).get("lastModified");
+
+          newRow.setFetchInterval(changeFrequency);
+          newRow.setModifiedTime(Long.valueOf(modifiedTime));
+          newRow.setStmPriority(
+              Float.parseFloat(outlinkMap.get(outlink).get("priority")));
+
+          Mark.INJECT_MARK.putMark(newRow, InjectType.SITEMAP_INJECT.getTypeString());
+
+          try {
+            context.write(reversedUrl, newRow);
+          } catch (IOException e) {
+            e.printStackTrace();
+          } catch (InterruptedException e) {
+            e.printStackTrace();
+          }
+        }
+
+        parseMark(page);
+      }
+    }
+
+  }
+
+  private int calculateFetchInterval(String changeFrequency) {
+    if (changeFrequency.equals(ChangeFrequency.ALWAYS.toString())
+        || changeFrequency.equals(ChangeFrequency.HOURLY.toString())) {
+      return 3600; // 60 * 60
+    } else if (changeFrequency.equals(ChangeFrequency.DAILY.toString())) {
+      return 86400; // 24 * 60 * 60
+    } else if (changeFrequency.equals(ChangeFrequency.WEEKLY.toString())) {
+      return 604800; // 7 * 24 * 60 * 60
+    } else if (changeFrequency.equals(ChangeFrequency.MONTHLY.toString())) {
+      return 2628000; // average seconds in one month
+    } else if (changeFrequency.equals(ChangeFrequency.YEARLY.toString())
+        || changeFrequency.equals(ChangeFrequency.NEVER.toString())) {
+      return 31536000; // average seconds in one year
+    } else {
+      return Integer.MAX_VALUE; // unrecognized frequency: use the largest representable interval
+    }
+  }
+
+  private void parseMark(WebPage page) {
+    Utf8 fetchMark = Mark.FETCH_MARK.checkMark(page);
+    if (fetchMark != null) {
+      Mark.PARSE_MARK.putMark(page, fetchMark);
+    }
+  }
+
+  private void putOutlink(WebPage page, Outlink outlink, String toUrl) {
+    try {
+      toUrl = normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
+      toUrl = filters.filter(toUrl);
+    } catch (MalformedURLException e2) {
+      return;
+    } catch (URLFilterException e) {
+      return;
+    }
+    if (toUrl == null) {
+      return;
+    }
+    Utf8 utf8ToUrl = new Utf8(toUrl);
+    if (page.getOutlinks().get(utf8ToUrl) != null) {
+      // skip duplicate outlinks
+      return;
+    }
+    page.getOutlinks().put(utf8ToUrl, new Utf8(outlink.getAnchor()));
+  }
+
+  /**
+   * Parses given web page and stores parsed content within page. Puts a
+   * meta-redirect to outlinks.
+   *
+   * @param url
+   * @param page
+   */
+  public void process(String url, WebPage page) {
+    if (status(url, page)) {
+      return;
+    }
     Parse parse;
     try {
       parse = parse(url, page);
@@ -193,58 +342,20 @@ public class ParseUtil extends Configure
       return;
     }
 
-    org.apache.nutch.storage.ParseStatus pstatus = parse.getParseStatus();
+    ParseStatus pstatus = parse.getParseStatus();
     page.setParseStatus(pstatus);
     if (ParseStatusUtils.isSuccess(pstatus)) {
       if (pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) {
-        String newUrl = ParseStatusUtils.getMessage(pstatus);
-        int refreshTime = Integer.parseInt(ParseStatusUtils.getArg(pstatus, 1));
-        try {
-          newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
-          if (newUrl == null) {
-            LOG.warn("redirect normalized to null " + url);
-            return;
-          }
-          try {
-            newUrl = filters.filter(newUrl);
-          } catch (URLFilterException e) {
-            return;
-          }
-          if (newUrl == null) {
-            LOG.warn("redirect filtered to null " + url);
-            return;
-          }
-        } catch (MalformedURLException e) {
-          LOG.warn("malformed url exception parsing redirect " + url);
-          return;
-        }
-        page.getOutlinks().put(new Utf8(newUrl), new Utf8());
-        page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED,
-            TableUtil.YES_VAL);
-        if (newUrl == null || newUrl.equals(url)) {
-          String reprUrl = URLUtil.chooseRepr(url, newUrl,
-              refreshTime < FetcherJob.PERM_REFRESH_TIME);
-          if (reprUrl == null) {
-            LOG.warn("reprUrl==null for " + url);
-            return;
-          } else {
-            page.setReprUrl(new Utf8(reprUrl));
-          }
-        }
+        successRedirect(url, page, pstatus);
       } else {
         page.setText(new Utf8(parse.getText()));
         page.setTitle(new Utf8(parse.getTitle()));
-        ByteBuffer prevSig = page.getSignature();
-        if (prevSig != null) {
-          page.setPrevSignature(prevSig);
-        }
-        final byte[] signature = sig.calculate(page);
-        page.setSignature(ByteBuffer.wrap(signature));
+
+        setSignature(page);
+
         if (page.getOutlinks() != null) {
           page.getOutlinks().clear();
         }
-        final Outlink[] outlinks = parse.getOutlinks();
-        int outlinksToStore = Math.min(maxOutlinks, outlinks.length);
         String fromHost;
         if (ignoreExternalLinks) {
           try {
@@ -257,24 +368,11 @@ public class ParseUtil extends Configure
         }
         int validCount = 0;
 
-        for (int i = 0; validCount < outlinksToStore && i < outlinks.length; i++) {
+        final Outlink[] outlinks = parse.getOutlinks();
+        int outlinksToStore = Math.min(maxOutlinks, outlinks.length);
+        for (int i = 0; validCount < outlinksToStore
+            && i < outlinks.length; i++, validCount++) {
           String toUrl = outlinks[i].getToUrl();
-          try {
-            toUrl = normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
-            toUrl = filters.filter(toUrl);
-          } catch (MalformedURLException e2) {
-            continue;
-          } catch (URLFilterException e) {
-            continue;
-          }
-          if (toUrl == null) {
-            continue;
-          }
-          Utf8 utf8ToUrl = new Utf8(toUrl);
-          if (page.getOutlinks().get(utf8ToUrl) != null) {
-            // skip duplicate outlinks
-            continue;
-          }
           String toHost;
           if (ignoreExternalLinks) {
             try {
@@ -286,14 +384,56 @@ public class ParseUtil extends Configure
               continue; // skip it
             }
           }
-          validCount++;
-          page.getOutlinks().put(utf8ToUrl, new Utf8(outlinks[i].getAnchor()));
-        }
-        Utf8 fetchMark = Mark.FETCH_MARK.checkMark(page);
-        if (fetchMark != null) {
-          Mark.PARSE_MARK.putMark(page, fetchMark);
+          putOutlink(page, outlinks[i], toUrl);
         }
+        parseMark(page);
+      }
+    }
+  }
+
+  /**
+   * Handles a SUCCESS_REDIRECT parse status: extracts the redirect target
+   * from the status message (and the refresh time from arg 1), normalizes it
+   * with fetcher scope and filters it, records it as an outlink with an
+   * empty anchor, and flags the page with REDIRECT_DISCOVERED metadata.
+   * When the redirect resolves back to the original URL, a representative
+   * URL is chosen and stored as reprUrl.
+   *
+   * @param url the page's (already unreversed) URL, used for logging and
+   *          representative-URL selection
+   * @param page the page being parsed; outlinks/metadata/reprUrl mutated
+   * @param pstatus the redirect parse status carrying target and refresh time
+   */
+  private void successRedirect(String url, WebPage page, ParseStatus pstatus) {
+    String newUrl = ParseStatusUtils.getMessage(pstatus);
+    int refreshTime = Integer.parseInt(ParseStatusUtils.getArg(pstatus, 1));
+    try {
+      newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
+      if (newUrl == null) {
+        LOG.warn("redirect normalized to null " + url);
+        return;
+      }
+      try {
+        newUrl = filters.filter(newUrl);
+      } catch (URLFilterException e) {
+        return;
+      }
+      if (newUrl == null) {
+        LOG.warn("redirect filtered to null " + url);
+        return;
+      }
+    } catch (MalformedURLException e) {
+      LOG.warn("malformed url exception parsing redirect " + url);
+      return;
+    }
+    page.getOutlinks().put(new Utf8(newUrl), new Utf8());
+    page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED,
+        TableUtil.YES_VAL);
+    // NOTE(review): newUrl was verified non-null above, so the null check
+    // below is dead; reprUrl is only chosen when the redirect target equals
+    // the original URL (logic carried over unchanged from the old inline
+    // code) — confirm the condition is not meant to be a negation.
+    if (newUrl == null || newUrl.equals(url)) {
+      String reprUrl = URLUtil.chooseRepr(url, newUrl,
+          refreshTime < FetcherJob.PERM_REFRESH_TIME);
+      if (reprUrl == null) {
+        LOG.warn("reprUrl==null for " + url);
+        return;
+      } else {
+        page.setReprUrl(new Utf8(reprUrl));
+      }
+    }
+  }
+
+  /**
+   * Recomputes the page's content signature. Any existing signature is first
+   * preserved as the previous signature so change detection can compare the
+   * two across crawl cycles.
+   */
+  private void setSignature(WebPage page) {
+    ByteBuffer prevSig = page.getSignature();
+    if (prevSig != null) {
+      page.setPrevSignature(prevSig);
    }
+    final byte[] signature = sig.calculate(page);
+    page.setSignature(ByteBuffer.wrap(signature));
  }
 }

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java Tue Jan 26 19:19:02 2016
@@ -35,6 +35,7 @@ import org.apache.nutch.crawl.GeneratorJ
 import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.metadata.HttpHeaders;
 import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.storage.Mark;
 import org.apache.nutch.storage.ParseStatus;
 import org.apache.nutch.storage.StorageUtils;
@@ -61,6 +62,8 @@ public class ParserJob extends NutchTool
 
   private static final Utf8 REPARSE = new Utf8("-reparse");
 
+  private static String SITEMAP_PARSE = "parse.sitemap";
+
   private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
 
   private Configuration conf;
@@ -75,16 +78,19 @@ public class ParserJob extends NutchTool
     FIELDS.add(WebPage.Field.OUTLINKS);
     FIELDS.add(WebPage.Field.METADATA);
     FIELDS.add(WebPage.Field.HEADERS);
+    FIELDS.add(WebPage.Field.SITEMAPS);
+    FIELDS.add(WebPage.Field.STM_PRIORITY);
   }
 
-  public static class ParserMapper extends
-      GoraMapper<String, WebPage, String, WebPage> {
+  public static class ParserMapper extends GoraMapper<String, WebPage, String, WebPage> {
     private ParseUtil parseUtil;
 
     private boolean shouldResume;
 
     private boolean force;
 
+    private boolean sitemap;
+
     private Utf8 batchId;
 
     private boolean skipTruncated;
@@ -95,9 +101,15 @@ public class ParserJob extends NutchTool
       parseUtil = new ParseUtil(conf);
       shouldResume = conf.getBoolean(RESUME_KEY, false);
       force = conf.getBoolean(FORCE_KEY, false);
+      sitemap = conf.getBoolean(SITEMAP_PARSE, false);
       batchId = new Utf8(
           conf.get(GeneratorJob.BATCH_ID, Nutch.ALL_BATCH_ID_STR));
       skipTruncated = conf.getBoolean(SKIP_TRUNCATED, true);
+      if (sitemap) {
+        skipTruncated = false;
+      } else {
+        skipTruncated = conf.getBoolean(SKIP_TRUNCATED, true);
+      }
     }
 
     @Override
@@ -109,8 +121,7 @@ public class ParserJob extends NutchTool
       } else {
         if (Mark.FETCH_MARK.checkMark(page) == null) {
           if (LOG.isDebugEnabled()) {
-            LOG.debug("Skipping " + TableUtil.unreverseUrl(key)
-                + "; not fetched yet");
+            LOG.debug("Skipping {}, not fetched yet", unreverseKey);
           }
           return;
         }
@@ -130,7 +141,12 @@ public class ParserJob extends NutchTool
         return;
       }
 
-      parseUtil.process(key, page);
+      if (sitemap && URLFilters.isSitemap(page)) {
+        LOG.info("Parsing for sitemap"); //TODO this log should be top line
+        parseUtil.processSitemapParse(unreverseKey, page, context);
+      } else {
+        parseUtil.process(unreverseKey, page);
+      }
       ParseStatus pstatus = page.getParseStatus();
       if (pstatus != null) {
         context.getCounter("ParserStatus",
@@ -230,6 +246,7 @@ public class ParserJob extends NutchTool
     String batchId = (String) args.get(Nutch.ARG_BATCH);
     Boolean shouldResume = (Boolean) args.get(Nutch.ARG_RESUME);
     Boolean force = (Boolean) args.get(Nutch.ARG_FORCE);
+    Boolean sitemap = (Boolean) args.get(Nutch.ARG_SITEMAP);
 
     if (batchId != null) {
       getConf().set(GeneratorJob.BATCH_ID, batchId);
@@ -240,13 +257,15 @@ public class ParserJob extends NutchTool
     if (force != null) {
       getConf().setBoolean(FORCE_KEY, force);
     }
-    LOG.info("ParserJob: resuming:\t" + getConf().getBoolean(RESUME_KEY, false));
-    LOG.info("ParserJob: forced reparse:\t"
-        + getConf().getBoolean(FORCE_KEY, false));
+    if (sitemap != null) {
+      getConf().setBoolean(SITEMAP_PARSE, sitemap);
+    }
+    LOG.info("ParserJob: resuming:\t{}", getConf().getBoolean(RESUME_KEY, false));
+    LOG.info("ParserJob: forced reparse:\t {}", getConf().getBoolean(FORCE_KEY, false));
     if (batchId == null || batchId.equals(Nutch.ALL_BATCH_ID_STR)) {
       LOG.info("ParserJob: parsing all");
     } else {
-      LOG.info("ParserJob: batchId:\t" + batchId);
+      LOG.info("ParserJob: batchId:\t{}", batchId);
     }
     currentJob = NutchJob.getInstance(getConf(), "parse");
 
@@ -278,39 +297,48 @@ public class ParserJob extends NutchTool
 
   public int parse(String batchId, boolean shouldResume, boolean force)
       throws Exception {
+    return parse(batchId, shouldResume, force, false);
+  }
+
+  public int parse(String batchId, boolean shouldResume, boolean force,
+      boolean sitemap)
+          throws Exception {
 
     SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
     long start = System.currentTimeMillis();
-    LOG.info("ParserJob: starting at " + sdf.format(start));
+    LOG.info("ParserJob: starting at {}", sdf.format(start));
 
     run(ToolUtil.toArgMap(Nutch.ARG_BATCH, batchId, Nutch.ARG_RESUME,
-        shouldResume, Nutch.ARG_FORCE, force));
+        shouldResume, Nutch.ARG_FORCE, force, Nutch.ARG_SITEMAP, sitemap));
     LOG.info("ParserJob: success");
 
     long finish = System.currentTimeMillis();
-    LOG.info("ParserJob: finished at " + sdf.format(finish)
-        + ", time elapsed: " + TimingUtil.elapsedTime(start, finish));
+    LOG.info("ParserJob: finished at {}, time elapsed: {}", 
+        sdf.format(finish), TimingUtil.elapsedTime(start, finish));
     return 0;
   }
 
   public int run(String[] args) throws Exception {
     boolean shouldResume = false;
     boolean force = false;
+    boolean sitemap = false;
     String batchId = null;
 
     if (args.length < 1) {
       System.err
-          .println("Usage: ParserJob (<batchId> | -all) [-crawlId <id>] [-resume] [-force]");
+      .println("Usage: ParserJob (<batchId> | -all) [-crawlId <id>] [-resume] [-force] [-sitemap]");
+      System.err
+      .println("    <batchId>     - symbolic batch ID created by Generator");
       System.err
-          .println("    <batchId>     - symbolic batch ID created by Generator");
+      .println("    -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t    (default: storage.crawl.id)");
       System.err
-          .println("    -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t    (default: storage.crawl.id)");
+      .println("    -all          - consider pages from all crawl jobs");
       System.err
-          .println("    -all          - consider pages from all crawl jobs");
+      .println("    -sitemap      - parse only sitemap pages, default false");
       System.err
-          .println("    -resume       - resume a previous incomplete job");
+      .println("    -resume       - resume a previous incomplete job");
       System.err
-          .println("    -force        - force re-parsing even if a page is already parsed");
+      .println("    -force        - force re-parsing even if a page is already parsed");
       return -1;
     }
     for (int i = 0; i < args.length; i++) {
@@ -322,6 +350,8 @@ public class ParserJob extends NutchTool
         getConf().set(Nutch.CRAWL_ID_KEY, args[++i]);
       } else if ("-all".equals(args[i])) {
         batchId = args[i];
+      } else if ("-sitemap".equals(args[i])) {
+        sitemap = true;
       } else {
         if (batchId != null) {
           System.err.println("BatchId already set to '" + batchId + "'!");
@@ -334,7 +364,7 @@ public class ParserJob extends NutchTool
       System.err.println("BatchId not set (or -all/-reparse not specified)!");
       return -1;
     }
-    return parse(batchId, shouldResume, force);
+    return parse(batchId, shouldResume, force, sitemap);
   }
 
   public static void main(String[] args) throws Exception {

Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/Mark.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/Mark.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/Mark.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/Mark.java Tue Jan 26 19:19:02 2016
@@ -19,8 +19,9 @@ package org.apache.nutch.storage;
 import org.apache.avro.util.Utf8;
 
 public enum Mark {
-  INJECT_MARK("_injmrk_"), GENERATE_MARK("_gnmrk_"), FETCH_MARK("_ftcmrk_"), PARSE_MARK(
-      "__prsmrk__"), UPDATEDB_MARK("_updmrk_"), INDEX_MARK("_idxmrk_");
+  INJECT_MARK("_injmrk_"), GENERATE_MARK("_gnmrk_"), FETCH_MARK("_ftcmrk_"),
+  PARSE_MARK("__prsmrk__"), UPDATEDB_MARK("_updmrk_"), INDEX_MARK("_idxmrk_"),
+  SITEMAP_MARK("_stmmrk_");
 
   private Utf8 name;