Posted to commits@nutch.apache.org by le...@apache.org on 2016/01/26 20:19:03 UTC

svn commit: r1726853 [2/2] - in /nutch/branches/2.x: ./ conf/ src/gora/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/parse/ src/java/org...

Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java Tue Jan 26 19:19:02 2016
@@ -27,7 +27,8 @@ public class WebPage extends org.apache.
     implements org.apache.avro.specific.SpecificRecord,
     org.apache.gora.persistency.Persistent {
   public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser()
-      .parse("{\"type\":\"record\",\"name\":\"WebPage\",\"namespace\":\"org.apache.nutch.storage\",\"doc\":\"WebPage is the primary data structure in Nutch representing crawl data for a given WebPage at some point in time\",\"fields\":[{\"name\":\"baseUrl\",\"type\":[\"null\",\"string\"],\"doc\":\"The original associated with this WebPage.\",\"default\":null},{\"name\":\"status\",\"type\":\"int\",\"doc\":\"A crawl status associated with the WebPage, can be of value STATUS_UNFETCHED - WebPage was not fetched yet, STATUS_FETCHED - WebPage was successfully fetched, STATUS_GONE - WebPage no longer exists, STATUS_REDIR_TEMP - WebPage temporarily redirects to other page, STATUS_REDIR_PERM - WebPage permanently redirects to other page, STATUS_RETRY - Fetching unsuccessful, needs to be retried e.g. transient errors and STATUS_NOTMODIFIED - fetching successful - page is not modified\",\"default\":0},{\"name\":\"fetchTime\",\"type\":\"long\",\"doc\":\"The system time in milliseconds for when 
 the page was fetched.\",\"default\":0},{\"name\":\"prevFetchTime\",\"type\":\"long\",\"doc\":\"The system time in milliseconds for when the page was last fetched if it was previously fetched which can be used to calculate time delta within a fetching schedule implementation\",\"default\":0},{\"name\":\"fetchInterval\",\"type\":\"int\",\"doc\":\"The default number of seconds between re-fetches of a page. The default is considered as 30 days unless a custom fetch schedle is implemented.\",\"default\":0},{\"name\":\"retriesSinceFetch\",\"type\":\"int\",\"doc\":\"The number of retried attempts at fetching the WebPage since it was last successfully fetched.\",\"default\":0},{\"name\":\"modifiedTime\",\"type\":\"long\",\"doc\":\"The system time in milliseconds for when this WebPage was modified by the WebPage author, if this is not available we default to the server for this information. This is important to understand the changing nature of the WebPage.\",\"default\":0},{\"name\":\"prevM
 odifiedTime\",\"type\":\"long\",\"doc\":\"The system time in milliseconds for when this WebPage was previously modified by the author, if this is not available then we default to the server for this information. This is important to understand the changing nature of a WebPage.\",\"default\":0},{\"name\":\"protocolStatus\",\"type\":[\"null\",{\"type\":\"record\",\"name\":\"ProtocolStatus\",\"doc\":\"A nested container representing data captured from web server responses.\",\"fields\":[{\"name\":\"code\",\"type\":\"int\",\"doc\":\"A protocol response code which can be one of SUCCESS - content was retrieved without errors, FAILED - Content was not retrieved. Any further errors may be indicated in args, PROTO_NOT_FOUND - This protocol was not found. Application may attempt to retry later, GONE - Resource is gone, MOVED - Resource has moved permanently. New url should be found in args, TEMP_MOVED - Resource has moved temporarily. New url should be found in args., NOTFOUND - Resource was 
 not found, RETRY - Temporary failure. Application may retry immediately., EXCEPTION - Unspecified exception occured. Further information may be provided in args., ACCESS_DENIED - Access denied - authorization required, but missing/incorrect., ROBOTS_DENIED - Access denied by robots.txt rules., REDIR_EXCEEDED - Too many redirects., NOTFETCHING - Not fetching., NOTMODIFIED - Unchanged since the last fetch., WOULDBLOCK - Request was refused by protocol plugins, because it would block. The expected number of milliseconds to wait before retry may be provided in args., BLOCKED - Thread was blocked http.max.delays times during fetching.\",\"default\":0},{\"name\":\"args\",\"type\":{\"type\":\"array\",\"items\":\"string\"},\"doc\":\"Optional arguments supplied to compliment and/or justify the response code.\",\"default\":[]},{\"name\":\"lastModified\",\"type\":\"long\",\"doc\":\"A server reponse indicating when this page was last modified, this can be unreliable at times hence this is used 
 as a default fall back value for the preferred 'modifiedTime' and 'preModifiedTime' obtained from the WebPage itself.\",\"default\":0}]}],\"default\":null},{\"name\":\"content\",\"type\":[\"null\",\"bytes\"],\"doc\":\"The entire raw document content e.g. raw XHTML\",\"default\":null},{\"name\":\"contentType\",\"type\":[\"null\",\"string\"],\"doc\":\"The type of the content contained within the document itself. ContentType is an alias for MimeType. Historically, this parameter was only called MimeType, but since this is actually the value included in the HTTP Content-Type header, it can also include the character set encoding, which makes it more than just a MimeType specification. If MimeType is specified e.g. not None, that value is used. Otherwise, ContentType is used. If neither is given, the DEFAULT_CONTENT_TYPE setting is used.\",\"default\":null},{\"name\":\"prevSignature\",\"type\":[\"null\",\"bytes\"],\"doc\":\"An implementation of a WebPage's previous signature from which i
 t can be identified and referenced at any point in time. This can be used to uniquely identify WebPage deltas based on page fingerprints.\",\"default\":null},{\"name\":\"signature\",\"type\":[\"null\",\"bytes\"],\"doc\":\"An implementation of a WebPage's signature from which it can be identified and referenced at any point in time. This is essentially the WebPage's fingerprint represnting its state for any point in time.\",\"default\":null},{\"name\":\"title\",\"type\":[\"null\",\"string\"],\"doc\":\"The title of the WebPage.\",\"default\":null},{\"name\":\"text\",\"type\":[\"null\",\"string\"],\"doc\":\"The textual content of the WebPage devoid from native markup.\",\"default\":null},{\"name\":\"parseStatus\",\"type\":[\"null\",{\"type\":\"record\",\"name\":\"ParseStatus\",\"doc\":\"A nested container representing parse status data captured from invocation of parsers on fetch of a WebPage\",\"fields\":[{\"name\":\"majorCode\",\"type\":\"int\",\"doc\":\"Major parsing status' includi
 ng NOTPARSED (Parsing was not performed), SUCCESS (Parsing succeeded), FAILED (General failure. There may be a more specific error message in arguments.)\",\"default\":0},{\"name\":\"minorCode\",\"type\":\"int\",\"doc\":\"Minor parsing status' including SUCCESS_OK - Successful parse devoid of anomalies or issues, SUCCESS_REDIRECT - Parsed content contains a directive to redirect to another URL. The target URL can be retrieved from the arguments., FAILED_EXCEPTION - Parsing failed. An Exception occured which may be retrieved from the arguments., FAILED_TRUNCATED - Parsing failed. Content was truncated, but the parser cannot handle incomplete content., FAILED_INVALID_FORMAT - Parsing failed. Invalid format e.g. the content may be corrupted or of wrong type., FAILED_MISSING_PARTS - Parsing failed. Other related parts of the content are needed to complete parsing. The list of URLs to missing parts may be provided in arguments. The Fetcher may decide to fetch these parts at once, then pu
 t them into Content.metadata, and supply them for re-parsing., FAILED_MISING_CONTENT - Parsing failed. There was no content to be parsed - probably caused by errors at protocol stage.\",\"default\":0},{\"name\":\"args\",\"type\":{\"type\":\"array\",\"items\":\"string\"},\"doc\":\"Optional arguments supplied to compliment and/or justify the parse status code.\",\"default\":[]}]}],\"default\":null},{\"name\":\"score\",\"type\":\"float\",\"doc\":\"A score used to determine a WebPage's relevance within the web graph it is part of. This score may change over time based on graph characteristics.\",\"default\":0},{\"name\":\"reprUrl\",\"type\":[\"null\",\"string\"],\"doc\":\"In the case where we are given two urls, a source and a destination of a redirect, we should determine and persist the representative url. The logic used to determine this is based largely on Yahoo!'s Slurp Crawler\",\"default\":null},{\"name\":\"headers\",\"type\":{\"type\":\"map\",\"values\":[\"null\",\"string\"]},\"
 doc\":\"Header information returned from the web server used to server the content which is subsequently fetched from. This includes keys such as TRANSFER_ENCODING, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_LOCATION, CONTENT_DISPOSITION, CONTENT_MD5, CONTENT_TYPE, LAST_MODIFIED and LOCATION.\",\"default\":{}},{\"name\":\"outlinks\",\"type\":{\"type\":\"map\",\"values\":[\"null\",\"string\"]},\"doc\":\"Embedded hyperlinks which direct outside of the current domain.\",\"default\":{}},{\"name\":\"inlinks\",\"type\":{\"type\":\"map\",\"values\":[\"null\",\"string\"]},\"doc\":\"Embedded hyperlinks which link to pages within the current domain.\",\"default\":{}},{\"name\":\"markers\",\"type\":{\"type\":\"map\",\"values\":[\"null\",\"string\"]},\"doc\":\"Markers flags which represent user and machine decisions which have affected influenced a WebPage's current state. Markers can be system specific and user machine driven in nature. They are assigned to a WebPage on a job-
 by-job basis and thier values indicative of what actions should be associated with a WebPage.\",\"default\":{}},{\"name\":\"metadata\",\"type\":{\"type\":\"map\",\"values\":[\"null\",\"bytes\"]},\"doc\":\"A multi-valued metadata container used for storing everything from structured WebPage characterists, to ad-hoc extraction and metadata augmentation for any given WebPage.\",\"default\":{}},{\"name\":\"batchId\",\"type\":[\"null\",\"string\"],\"doc\":\"A batchId that this WebPage is assigned to. WebPage's are fetched in batches, called fetchlists. Pages are partitioned but can always be associated and fetched alongside pages of similar value (within a crawl cycle) based on batchId.\",\"default\":null}]}");
+      .parse(
+          "{\"type\":\"record\",\"name\":\"WebPage\",\"namespace\":\"org.apache.nutch.storage\",\"doc\":\"WebPage is the primary data structure in Nutch representing crawl data for a given WebPage at some point in time\",\"fields\":[{\"name\":\"baseUrl\",\"type\":[\"null\",\"string\"],\"doc\":\"The original associated with this WebPage.\",\"default\":null},{\"name\":\"status\",\"type\":\"int\",\"doc\":\"A crawl status associated with the WebPage, can be of value STATUS_UNFETCHED - WebPage was not fetched yet, STATUS_FETCHED - WebPage was successfully fetched, STATUS_GONE - WebPage no longer exists, STATUS_REDIR_TEMP - WebPage temporarily redirects to other page, STATUS_REDIR_PERM - WebPage permanently redirects to other page, STATUS_RETRY - Fetching unsuccessful, needs to be retried e.g. transient errors and STATUS_NOTMODIFIED - fetching successful - page is not modified\",\"default\":0},{\"name\":\"fetchTime\",\"type\":\"long\",\"doc\":\"The system time in milliseconds for when the
  page was fetched.\",\"default\":0},{\"name\":\"prevFetchTime\",\"type\":\"long\",\"doc\":\"The system time in milliseconds for when the page was last fetched if it was previously fetched which can be used to calculate time delta within a fetching schedule implementation\",\"default\":0},{\"name\":\"fetchInterval\",\"type\":\"int\",\"doc\":\"The default number of seconds between re-fetches of a page. The default is considered as 30 days unless a custom fetch schedle is implemented.\",\"default\":0},{\"name\":\"retriesSinceFetch\",\"type\":\"int\",\"doc\":\"The number of retried attempts at fetching the WebPage since it was last successfully fetched.\",\"default\":0},{\"name\":\"modifiedTime\",\"type\":\"long\",\"doc\":\"The system time in milliseconds for when this WebPage was modified by the WebPage author, if this is not available we default to the server for this information. This is important to understand the changing nature of the WebPage.\",\"default\":0},{\"name\":\"prevModi
 fiedTime\",\"type\":\"long\",\"doc\":\"The system time in milliseconds for when this WebPage was previously modified by the author, if this is not available then we default to the server for this information. This is important to understand the changing nature of a WebPage.\",\"default\":0},{\"name\":\"protocolStatus\",\"type\":[\"null\",{\"type\":\"record\",\"name\":\"ProtocolStatus\",\"doc\":\"A nested container representing data captured from web server responses.\",\"fields\":[{\"name\":\"code\",\"type\":\"int\",\"doc\":\"A protocol response code which can be one of SUCCESS - content was retrieved without errors, FAILED - Content was not retrieved. Any further errors may be indicated in args, PROTO_NOT_FOUND - This protocol was not found. Application may attempt to retry later, GONE - Resource is gone, MOVED - Resource has moved permanently. New url should be found in args, TEMP_MOVED - Resource has moved temporarily. New url should be found in args., NOTFOUND - Resource was not
  found, RETRY - Temporary failure. Application may retry immediately., EXCEPTION - Unspecified exception occured. Further information may be provided in args., ACCESS_DENIED - Access denied - authorization required, but missing/incorrect., ROBOTS_DENIED - Access denied by robots.txt rules., REDIR_EXCEEDED - Too many redirects., NOTFETCHING - Not fetching., NOTMODIFIED - Unchanged since the last fetch., WOULDBLOCK - Request was refused by protocol plugins, because it would block. The expected number of milliseconds to wait before retry may be provided in args., BLOCKED - Thread was blocked http.max.delays times during fetching.\",\"default\":0},{\"name\":\"args\",\"type\":{\"type\":\"array\",\"items\":\"string\"},\"doc\":\"Optional arguments supplied to compliment and/or justify the response code.\",\"default\":[]},{\"name\":\"lastModified\",\"type\":\"long\",\"doc\":\"A server reponse indicating when this page was last modified, this can be unreliable at times hence this is used as 
 a default fall back value for the preferred 'modifiedTime' and 'preModifiedTime' obtained from the WebPage itself.\",\"default\":0}]}],\"default\":null},{\"name\":\"content\",\"type\":[\"null\",\"bytes\"],\"doc\":\"The entire raw document content e.g. raw XHTML\",\"default\":null},{\"name\":\"contentType\",\"type\":[\"null\",\"string\"],\"doc\":\"The type of the content contained within the document itself. ContentType is an alias for MimeType. Historically, this parameter was only called MimeType, but since this is actually the value included in the HTTP Content-Type header, it can also include the character set encoding, which makes it more than just a MimeType specification. If MimeType is specified e.g. not None, that value is used. Otherwise, ContentType is used. If neither is given, the DEFAULT_CONTENT_TYPE setting is used.\",\"default\":null},{\"name\":\"prevSignature\",\"type\":[\"null\",\"bytes\"],\"doc\":\"An implementation of a WebPage's previous signature from which it c
 an be identified and referenced at any point in time. This can be used to uniquely identify WebPage deltas based on page fingerprints.\",\"default\":null},{\"name\":\"signature\",\"type\":[\"null\",\"bytes\"],\"doc\":\"An implementation of a WebPage's signature from which it can be identified and referenced at any point in time. This is essentially the WebPage's fingerprint represnting its state for any point in time.\",\"default\":null},{\"name\":\"title\",\"type\":[\"null\",\"string\"],\"doc\":\"The title of the WebPage.\",\"default\":null},{\"name\":\"text\",\"type\":[\"null\",\"string\"],\"doc\":\"The textual content of the WebPage devoid from native markup.\",\"default\":null},{\"name\":\"parseStatus\",\"type\":[\"null\",{\"type\":\"record\",\"name\":\"ParseStatus\",\"doc\":\"A nested container representing parse status data captured from invocation of parsers on fetch of a WebPage\",\"fields\":[{\"name\":\"majorCode\",\"type\":\"int\",\"doc\":\"Major parsing status' including 
 NOTPARSED (Parsing was not performed), SUCCESS (Parsing succeeded), FAILED (General failure. There may be a more specific error message in arguments.)\",\"default\":0},{\"name\":\"minorCode\",\"type\":\"int\",\"doc\":\"Minor parsing status' including SUCCESS_OK - Successful parse devoid of anomalies or issues, SUCCESS_REDIRECT - Parsed content contains a directive to redirect to another URL. The target URL can be retrieved from the arguments., FAILED_EXCEPTION - Parsing failed. An Exception occured which may be retrieved from the arguments., FAILED_TRUNCATED - Parsing failed. Content was truncated, but the parser cannot handle incomplete content., FAILED_INVALID_FORMAT - Parsing failed. Invalid format e.g. the content may be corrupted or of wrong type., FAILED_MISSING_PARTS - Parsing failed. Other related parts of the content are needed to complete parsing. The list of URLs to missing parts may be provided in arguments. The Fetcher may decide to fetch these parts at once, then put t
 hem into Content.metadata, and supply them for re-parsing., FAILED_MISING_CONTENT - Parsing failed. There was no content to be parsed - probably caused by errors at protocol stage.\",\"default\":0},{\"name\":\"args\",\"type\":{\"type\":\"array\",\"items\":\"string\"},\"doc\":\"Optional arguments supplied to compliment and/or justify the parse status code.\",\"default\":[]}]}],\"default\":null},{\"name\":\"score\",\"type\":\"float\",\"doc\":\"A score used to determine a WebPage's relevance within the web graph it is part of. This score may change over time based on graph characteristics.\",\"default\":0},{\"name\":\"reprUrl\",\"type\":[\"null\",\"string\"],\"doc\":\"In the case where we are given two urls, a source and a destination of a redirect, we should determine and persist the representative url. The logic used to determine this is based largely on Yahoo!'s Slurp Crawler\",\"default\":null},{\"name\":\"headers\",\"type\":{\"type\":\"map\",\"values\":[\"null\",\"string\"]},\"doc
 \":\"Header information returned from the web server used to server the content which is subsequently fetched from. This includes keys such as TRANSFER_ENCODING, CONTENT_ENCODING, CONTENT_LANGUAGE, CONTENT_LENGTH, CONTENT_LOCATION, CONTENT_DISPOSITION, CONTENT_MD5, CONTENT_TYPE, LAST_MODIFIED and LOCATION.\",\"default\":{}},{\"name\":\"outlinks\",\"type\":{\"type\":\"map\",\"values\":[\"null\",\"string\"]},\"doc\":\"Embedded hyperlinks which direct outside of the current domain.\",\"default\":{}},{\"name\":\"inlinks\",\"type\":{\"type\":\"map\",\"values\":[\"null\",\"string\"]},\"doc\":\"Embedded hyperlinks which link to pages within the current domain.\",\"default\":{}},{\"name\":\"markers\",\"type\":{\"type\":\"map\",\"values\":[\"null\",\"string\"]},\"doc\":\"Markers flags which represent user and machine decisions which have affected influenced a WebPage's current state. Markers can be system specific and user machine driven in nature. They are assigned to a WebPage on a job-by-
 job basis and thier values indicative of what actions should be associated with a WebPage.\",\"default\":{}},{\"name\":\"metadata\",\"type\":{\"type\":\"map\",\"values\":[\"null\",\"bytes\"]},\"doc\":\"A multi-valued metadata container used for storing everything from structured WebPage characterists, to ad-hoc extraction and metadata augmentation for any given WebPage.\",\"default\":{}},{\"name\":\"batchId\",\"type\":[\"null\",\"string\"],\"doc\":\"A batchId that this WebPage is assigned to. WebPage's are fetched in batches, called fetchlists. Pages are partitioned but can always be associated and fetched alongside pages of similar value (within a crawl cycle) based on batchId.\",\"default\":null},{\"name\":\"sitemaps\",\"type\":{\"type\":\"map\",\"values\":[\"null\",\"string\"]},\"doc\":\"Sitemap urls in robot.txt\",\"default\":{}},{\"name\":\"stmPriority\",\"type\":\"float\",\"doc\":\"A priority  inlinks from sitemap file\",\"default\":0}]}");
 
   /** Enum containing all data bean's fields. */
   public static enum Field {
@@ -40,7 +41,8 @@ public class WebPage extends org.apache.
         14, "text"), PARSE_STATUS(15, "parseStatus"), SCORE(16, "score"), REPR_URL(
         17, "reprUrl"), HEADERS(18, "headers"), OUTLINKS(19, "outlinks"), INLINKS(
         20, "inlinks"), MARKERS(21, "markers"), METADATA(22, "metadata"), BATCH_ID(
-        23, "batchId"), ;
+        23, "batchId"), SITEMAPS(24, "sitemaps"), STM_PRIORITY(25,
+        "stmPriority"),;
     /**
      * Field's index.
      */
@@ -97,7 +99,7 @@ public class WebPage extends org.apache.
       "modifiedTime", "prevModifiedTime", "protocolStatus", "content",
       "contentType", "prevSignature", "signature", "title", "text",
       "parseStatus", "score", "reprUrl", "headers", "outlinks", "inlinks",
-      "markers", "metadata", "batchId", };
+      "markers", "metadata", "batchId", "sitemaps", "stmPriority" };
 
   /**
    * Gets the total field count.
@@ -187,6 +189,8 @@ public class WebPage extends org.apache.
    * part of. This score may change over time based on graph characteristics.
    */
   private float score;
+
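+  /**
+   * A priority value for urls obtained from a sitemap file.
+   */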
+  private float stmPriority;
   /**
    * In the case where we are given two urls, a source and a destination of a
    * redirect, we should determine and persist the representative url. The logic
@@ -227,6 +231,8 @@ public class WebPage extends org.apache.
    */
   private java.lang.CharSequence batchId;
 
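+  /**
+   * Sitemap urls discovered for this host, e.g. in its robots.txt.
+   */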
+  private java.util.Map<java.lang.CharSequence, java.lang.CharSequence> sitemaps;
+
   public org.apache.avro.Schema getSchema() {
     return SCHEMA$;
   }
@@ -282,6 +288,10 @@ public class WebPage extends org.apache.
       return metadata;
     case 23:
       return batchId;
+    case 24:
+      return sitemaps;
+    case 25:
+      return stmPriority;
     default:
       throw new org.apache.avro.AvroRuntimeException("Bad index");
     }
@@ -373,6 +383,14 @@ public class WebPage extends org.apache.
     case 23:
       batchId = (java.lang.CharSequence) (value);
       break;
+    case 24:
+      sitemaps = (java.util.Map<java.lang.CharSequence, java.lang.CharSequence>) ((value instanceof org.apache.gora.persistency.Dirtyable) ? value
+          : new org.apache.gora.persistency.impl.DirtyMapWrapper(
+          (java.util.Map) value));
+      break;
+    case 25:
+      stmPriority = (java.lang.Float) (value);
+      break;
     default:
       throw new org.apache.avro.AvroRuntimeException("Bad index");
     }
@@ -911,6 +929,33 @@ public class WebPage extends org.apache.
   }
 
   /**
+   * Gets the value of the 'stmPriority' field. The stmPriority indicates the
+   * priority value of urls obtained from a sitemap.
+   */
+  public java.lang.Float getStmPriority() {
+    return stmPriority;
+  }
+
+  /**
+   * Sets the value of the 'stmPriority' field. The stmPriority indicates the
+   * priority value of urls obtained from a sitemap.
+   * @param value the value to set.
+   */
+  public void setStmPriority(java.lang.Float value) {
+    this.stmPriority = value;
+    setDirty(25);
+  }
+
+  /**
+   * Checks the dirty status of the 'stmPriority' field. A field is dirty if it
+   * represents a change that has not yet been written to the database.
+   * The stmPriority indicates the priority value of urls obtained from a
+   * sitemap.
+   * @param value the value to check.
+   */
+  public boolean isStmPriorityDirty(java.lang.Float value) {
+    return isDirty(25);
+  }
+
+  /**
    * Gets the value of the 'reprUrl' field. In the case where we are given two
    * urls, a source and a destination of a redirect, we should determine and
    * persist the representative url. The logic used to determine this is based
@@ -1151,6 +1196,36 @@ public class WebPage extends org.apache.
     return isDirty(23);
   }
 
+
+  /**
+   * Gets the value of the 'sitemaps' field. Sitemap list for the current host.
+   */
+  public java.util.Map<java.lang.CharSequence, java.lang.CharSequence> getSitemaps() {
+    return sitemaps;
+  }
+
+  /**
+   * Sets the value of the 'sitemaps' field. Sitemap list for the current host.
+   * @param value the value to set.
+   */
+  public void setSitemaps(
+      java.util.Map<java.lang.CharSequence, java.lang.CharSequence> value) {
+    this.sitemaps = (value instanceof org.apache.gora.persistency.Dirtyable) ? value
+        : new org.apache.gora.persistency.impl.DirtyMapWrapper(value);
+    setDirty(24);
+  }
+
+  /**
+   * Checks the dirty status of the 'sitemaps' field. A field is dirty if it
+   * represents a change that has not yet been written to the database.
+   * Sitemap list for the current host.
+   * @param value the value to check.
+   */
+  public boolean isSitemapsDirty(
+      java.util.Map<java.lang.CharSequence, java.lang.CharSequence> value) {
+    return isDirty(24);
+  }
+
   /** Creates a new WebPage RecordBuilder */
   public static org.apache.nutch.storage.WebPage.Builder newBuilder() {
     return new org.apache.nutch.storage.WebPage.Builder();
@@ -1217,6 +1292,7 @@ public class WebPage extends org.apache.
     private java.lang.CharSequence text;
     private org.apache.nutch.storage.ParseStatus parseStatus;
     private float score;
+    private float stmPriority;
     private java.lang.CharSequence reprUrl;
     private java.util.Map<java.lang.CharSequence, java.lang.CharSequence> headers;
     private java.util.Map<java.lang.CharSequence, java.lang.CharSequence> outlinks;
@@ -1224,6 +1300,8 @@ public class WebPage extends org.apache.
     private java.util.Map<java.lang.CharSequence, java.lang.CharSequence> markers;
     private java.util.Map<java.lang.CharSequence, java.nio.ByteBuffer> metadata;
     private java.lang.CharSequence batchId;
+    private java.util.Map<java.lang.CharSequence, java.lang.CharSequence> sitemaps;
+
 
     /** Creates a new Builder */
     private Builder() {
@@ -1358,6 +1436,17 @@ public class WebPage extends org.apache.
             fields()[23].schema(), other.batchId);
         fieldSetFlags()[23] = true;
       }
+      if (isValidValue(fields()[24], other.sitemaps)) {
+        this.sitemaps = (java.util.Map<java.lang.CharSequence, java.lang.CharSequence>) data()
+            .deepCopy(fields()[24].schema(), other.sitemaps);
+        fieldSetFlags()[24] = true;
+      }
+      if (isValidValue(fields()[25], other.stmPriority)) {
+        this.stmPriority = (java.lang.Float) data()
+            .deepCopy(fields()[25].schema(),
+                other.stmPriority);
+        fieldSetFlags()[25] = true;
+      }
     }
 
     /** Gets the value of the 'baseUrl' field */
@@ -1970,6 +2059,44 @@ public class WebPage extends org.apache.
       return this;
     }
 
+    /** Gets the value of the 'sitemaps' field */
+    public java.util.Map<java.lang.CharSequence, java.lang.CharSequence> getSitemaps() {
+      return sitemaps;
+    }
+
+    /** Sets the value of the 'sitemaps' field */
+    public org.apache.nutch.storage.WebPage.Builder setSitemaps(
+        java.util.Map<java.lang.CharSequence, java.lang.CharSequence> value) {
+      validate(fields()[24], value);
+      this.sitemaps = value;
+      fieldSetFlags()[24] = true;
+      return this;
+    }
+
+    /** Checks whether the 'sitemaps' field has been set */
+    public boolean hasSitemaps() {
+      return fieldSetFlags()[24];
+    }
+
+    /** Clears the value of the 'sitemaps' field */
+    public org.apache.nutch.storage.WebPage.Builder clearSitemaps() {
+      sitemaps = null;
+      fieldSetFlags()[24] = false;
+      return this;
+    }
+
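+    /** Gets the value of the 'stmPriority' field */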
+    public java.lang.Float getStmPriority() {
+      return stmPriority;
+    }
+
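+    /** Sets the value of the 'stmPriority' field */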
+    public org.apache.nutch.storage.WebPage.Builder setStmPriority(
+        float value) {
+      validate(fields()[25], value);
+      this.stmPriority = value;
+      fieldSetFlags()[25] = true;
+      return this;
+    }
+
     @Override
     public WebPage build() {
       try {
@@ -2027,6 +2154,11 @@ public class WebPage extends org.apache.
                 (java.util.Map) defaultValue(fields()[22]));
         record.batchId = fieldSetFlags()[23] ? this.batchId
             : (java.lang.CharSequence) defaultValue(fields()[23]);
+        record.sitemaps = fieldSetFlags()[24] ? this.sitemaps
+            : (java.util.Map<java.lang.CharSequence, java.lang.CharSequence>) new org.apache.gora.persistency.impl.DirtyMapWrapper(
+            (java.util.Map) defaultValue(fields()[24]));
+        record.stmPriority = fieldSetFlags()[25] ? this.stmPriority
+            : (java.lang.Float) defaultValue(fields()[25]);
         return record;
       } catch (Exception e) {
         throw new org.apache.avro.AvroRuntimeException(e);
@@ -2873,6 +3005,63 @@ public class WebPage extends org.apache.
       throw new java.lang.UnsupportedOperationException(
           "IsDirty is not supported on tombstones");
     }
+
+    /**
+     * Gets the value of the 'sitemaps' field. Sitemap list for the current host.
+     */
+    public java.util.Map<java.lang.CharSequence, java.lang.CharSequence> getSitemaps() {
+      throw new java.lang.UnsupportedOperationException(
+          "Get is not supported on tombstones");
+    }
+
+    /**
+     * Sets the value of the 'sitemaps' field. Sitemap list for the current
+     * host.
+     * @param value the value to set.
+     */
+    public void setSitemaps(
+        java.util.Map<java.lang.CharSequence, java.lang.CharSequence> value) {
+      throw new java.lang.UnsupportedOperationException(
+          "Set is not supported on tombstones");
+    }
+
+    /**
+     * Checks the dirty status of the 'sitemaps' field. A field is dirty if it
+     * represents a change that has not yet been written to the database.
+     * Sitemap list for the current host.
+     * @param value the value to check.
+     */
+    public boolean isSitemapsDirty(
+        java.util.Map<java.lang.CharSequence, java.lang.CharSequence> value) {
+      throw new java.lang.UnsupportedOperationException(
+          "IsDirty is not supported on tombstones");
+    }
+
+    /**
+     * Gets the value of the 'stmPriority' field. The stmPriority indicates the
+     * priority value of urls obtained from a sitemap.
+     */
+    public java.lang.Float getStmPriority() {
+      throw new java.lang.UnsupportedOperationException(
+          "Get is not supported on tombstones");
+    }
+
+    /**
+     * Sets the value of the 'stmPriority' field. The stmPriority indicates the
+     * priority value of urls obtained from a sitemap.
+     * @param value the value to set.
+     */
+    public void setStmPriority(java.lang.Float value) {
+      throw new java.lang.UnsupportedOperationException(
+          "Set is not supported on tombstones");
+    }
+
+    /**
+     * Checks the dirty status of the 'stmPriority' field. A field is dirty if it
+     * represents a change that has not yet been written to the database.
+     * The stmPriority indicates the priority value of urls obtained from a
+     * sitemap.
+     * @param value the value to check.
+     */
+    public boolean isStmPriorityDirty(java.lang.Float value) {
+      throw new java.lang.UnsupportedOperationException(
+          "IsDirty is not supported on tombstones");
+    }
 
   }
 

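For reference, the two new bean fields above, 'sitemaps' and 'stmPriority', are reachable through the builder and accessor methods shown in this diff. A minimal usage sketch, assuming only that API (the url and priority values are illustrative):

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.nutch.storage.WebPage;

    public class SitemapFieldsSketch {
      public static void main(String[] args) {
        Map<CharSequence, CharSequence> sitemaps =
            new HashMap<CharSequence, CharSequence>();
        sitemaps.put("http://example.com/sitemap.xml", "");

        // The builder fills both new fields; anything left unset falls back
        // to its schema default.
        WebPage page = WebPage.newBuilder()
            .setSitemaps(sitemaps)
            .setStmPriority(0.5f)
            .build();

        // The bean setters additionally mark the fields dirty (schema
        // indexes 24 and 25 above).
        page.setStmPriority(0.8f);

        System.out.println(page.getSitemaps());
        System.out.println(page.getStmPriority());
      }
    }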
Modified: nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/tools/Benchmark.java Tue Jan 26 19:19:02 2016
@@ -227,7 +227,7 @@ public class Benchmark extends Configure
     for (i = 0; i < depth; i++) { // generate new batch
       start = System.currentTimeMillis();
       String batchId = generator.generate(topN, System.currentTimeMillis(),
-          false, false);
+          false, false, false);
       delta = System.currentTimeMillis() - start;
       res.addTiming("generate", i + "", delta);
       if (batchId == null) {

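The one-line Benchmark change reflects the widened GeneratorJob.generate() signature, which gains a fifth boolean selecting sitemap-only generation. A hedged sketch of a call, assuming the first four parameters keep their previous meaning (topN, current time, filter, normalise); only the argument order is taken from this diff:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.crawl.GeneratorJob;
    import org.apache.nutch.util.NutchConfiguration;

    public class GenerateSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        GeneratorJob generator = new GeneratorJob();
        generator.setConf(conf);
        // (topN, curTime, filter, norm, sitemap) -- pass true as the final
        // argument to restrict the fetchlist to sitemap urls.
        String batchId = generator.generate(1000, System.currentTimeMillis(),
            false, false, false);
        System.out.println("generated batch: " + batchId);
      }
    }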
Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Tue Jan 26 19:19:02 2016
@@ -129,7 +129,12 @@ public abstract class HttpBase implement
     this.proxyPort = conf.getInt("http.proxy.port", 8080);
     this.useProxy = (proxyHost != null && proxyHost.length() > 0);
     this.timeout = conf.getInt("http.timeout", 10000);
-    this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
+    boolean sitemap = conf.getBoolean("fetcher.job.sitemap", false);
+    if (sitemap) {
+      this.maxContent = -1;
+    } else {
+      this.maxContent = conf.getInt("http.content.limit", 64 * 1024);
+    }
     this.userAgent = getAgentString(conf.get("http.agent.name"),
         conf.get("http.agent.version"), conf.get("http.agent.description"),
         conf.get("http.agent.url"), conf.get("http.agent.email"));

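With this change HttpBase drops the content cap for sitemap jobs, since sitemap files routinely exceed the default 64 KB limit and must be retrieved whole to recover every url they list. A sketch of the two properties involved (property names exactly as in the diff):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.util.NutchConfiguration;

    public class SitemapFetchConfSketch {
      public static void main(String[] args) {
        Configuration conf = NutchConfiguration.create();
        // Regular fetches truncate response bodies at http.content.limit
        // (64 KB unless overridden)...
        conf.setInt("http.content.limit", 64 * 1024);
        // ...but when fetcher.job.sitemap is true, HttpBase sets
        // maxContent = -1, i.e. no truncation.
        conf.setBoolean("fetcher.job.sitemap", true);
      }
    }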
Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java Tue Jan 26 19:19:02 2016
@@ -17,6 +17,7 @@
 package org.apache.nutch.crawl;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
 import org.apache.nutch.storage.Mark;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.AbstractNutchTest;
@@ -26,8 +27,6 @@ import org.junit.After;
 import org.junit.Before;
 import org.junit.Ignore;
 import org.junit.Test;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.util.ArrayList;
@@ -48,8 +47,6 @@ import static org.junit.Assert.assertEqu
  */
 public class TestGenerator extends AbstractNutchTest {
 
-  public static final Logger LOG = LoggerFactory.getLogger(TestGenerator.class);
-
   private static String[] FIELDS = new String[] {
       WebPage.Field.MARKERS.getName(), WebPage.Field.SCORE.getName() };
 
@@ -74,6 +71,9 @@ public class TestGenerator extends Abstr
   @Ignore("GORA-240 Tests for MemStore")
   public void testGenerateHighest() throws Exception {
 
+    String batchId = "1234";
+    conf.set(GeneratorJob.BATCH_ID, batchId);
+
     final int NUM_RESULTS = 2;
 
     ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
@@ -85,9 +85,7 @@ public class TestGenerator extends Abstr
     for (URLWebPage uwp : list) {
       webPageStore.put(TableUtil.reverseUrl(uwp.getUrl()), uwp.getDatum());
     }
-    webPageStore.flush();
-
-    generateFetchlist(NUM_RESULTS, conf, false);
+    CrawlTestUtil.generateFetchlist(NUM_RESULTS, conf, false, false);
 
     ArrayList<URLWebPage> l = CrawlTestUtil.readContents(webPageStore,
         Mark.GENERATE_MARK, FIELDS);
@@ -136,6 +134,10 @@ public class TestGenerator extends Abstr
   @Test
   @Ignore("GORA-240 Tests for MemStore")
   public void testGenerateHostLimit() throws Exception {
+
+    String batchId = "1234";
+    conf.set(GeneratorJob.BATCH_ID, batchId);
+
     ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
 
     list.add(createURLWebPage("http://www.example.com/index1.html", 1, 1));
@@ -145,13 +147,13 @@ public class TestGenerator extends Abstr
     for (URLWebPage uwp : list) {
       webPageStore.put(TableUtil.reverseUrl(uwp.getUrl()), uwp.getDatum());
     }
-    webPageStore.flush();
 
     Configuration myConfiguration = new Configuration(conf);
     myConfiguration.setInt(GeneratorJob.GENERATOR_MAX_COUNT, 1);
     myConfiguration.set(GeneratorJob.GENERATOR_COUNT_MODE,
         GeneratorJob.GENERATOR_COUNT_VALUE_HOST);
-    generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
+    CrawlTestUtil.generateFetchlist(Integer.MAX_VALUE, myConfiguration, false,
+        false);
 
     ArrayList<URLWebPage> fetchList = CrawlTestUtil.readContents(webPageStore,
         Mark.GENERATE_MARK, FIELDS);
@@ -161,8 +163,8 @@ public class TestGenerator extends Abstr
 
     myConfiguration = new Configuration(conf);
     myConfiguration.setInt(GeneratorJob.GENERATOR_MAX_COUNT, 2);
-    generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
-
+    CrawlTestUtil.generateFetchlist(Integer.MAX_VALUE, myConfiguration, false,
+        false);
     fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK,
         FIELDS);
 
@@ -171,8 +173,8 @@ public class TestGenerator extends Abstr
 
     myConfiguration = new Configuration(conf);
     myConfiguration.setInt(GeneratorJob.GENERATOR_MAX_COUNT, 3);
-    generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
-
+    CrawlTestUtil.generateFetchlist(Integer.MAX_VALUE, myConfiguration, false,
+        false);
     fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK,
         FIELDS);
 
@@ -189,6 +191,8 @@ public class TestGenerator extends Abstr
   @Test
   @Ignore("GORA-240 Tests for MemStore")
   public void testGenerateDomainLimit() throws Exception {
+    String batchId = "1234";
+    conf.set(GeneratorJob.BATCH_ID, batchId);
     ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
 
     list.add(createURLWebPage("http://one.example.com/index.html", 1, 1));
@@ -201,15 +205,14 @@ public class TestGenerator extends Abstr
     for (URLWebPage uwp : list) {
       webPageStore.put(TableUtil.reverseUrl(uwp.getUrl()), uwp.getDatum());
     }
-    webPageStore.flush();
 
     Configuration myConfiguration = new Configuration(conf);
     myConfiguration.setInt(GeneratorJob.GENERATOR_MAX_COUNT, 1);
     myConfiguration.set(GeneratorJob.GENERATOR_COUNT_MODE,
         GeneratorJob.GENERATOR_COUNT_VALUE_DOMAIN);
 
-    generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
-
+    CrawlTestUtil.generateFetchlist(Integer.MAX_VALUE, myConfiguration, false,
+        false);
     ArrayList<URLWebPage> fetchList = CrawlTestUtil.readContents(webPageStore,
         Mark.GENERATE_MARK, FIELDS);
 
@@ -218,8 +221,8 @@ public class TestGenerator extends Abstr
 
     myConfiguration = new Configuration(myConfiguration);
     myConfiguration.setInt(GeneratorJob.GENERATOR_MAX_COUNT, 2);
-    generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
-
+    CrawlTestUtil.generateFetchlist(Integer.MAX_VALUE, myConfiguration, false,
+        false);
     fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK,
         FIELDS);
 
@@ -228,8 +231,8 @@ public class TestGenerator extends Abstr
 
     myConfiguration = new Configuration(myConfiguration);
     myConfiguration.setInt(GeneratorJob.GENERATOR_MAX_COUNT, 3);
-    generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
-
+    CrawlTestUtil.generateFetchlist(Integer.MAX_VALUE, myConfiguration, false,
+        false);
     fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK,
         FIELDS);
 
@@ -247,6 +250,9 @@ public class TestGenerator extends Abstr
   @Ignore("GORA-240 Tests for MemStore")
   public void testFilter() throws IOException, Exception {
 
+    String batchId = "1234";
+    conf.set(GeneratorJob.BATCH_ID, batchId);
+
     ArrayList<URLWebPage> list = new ArrayList<URLWebPage>();
 
     list.add(createURLWebPage("http://www.example.com/index.html", 1, 1));
@@ -256,20 +262,19 @@ public class TestGenerator extends Abstr
     for (URLWebPage uwp : list) {
       webPageStore.put(TableUtil.reverseUrl(uwp.getUrl()), uwp.getDatum());
     }
-    webPageStore.flush();
 
     Configuration myConfiguration = new Configuration(conf);
     myConfiguration.set("urlfilter.suffix.file", "filter-all.txt");
 
-    generateFetchlist(Integer.MAX_VALUE, myConfiguration, true);
-
+    CrawlTestUtil.generateFetchlist(Integer.MAX_VALUE, myConfiguration, true,
+        false);
     ArrayList<URLWebPage> fetchList = CrawlTestUtil.readContents(webPageStore,
         Mark.GENERATE_MARK, FIELDS);
 
     assertEquals(0, fetchList.size());
 
-    generateFetchlist(Integer.MAX_VALUE, myConfiguration, false);
-
+    CrawlTestUtil.generateFetchlist(Integer.MAX_VALUE, myConfiguration, false,
+        false);
     fetchList = CrawlTestUtil.readContents(webPageStore, Mark.GENERATE_MARK,
         FIELDS);
 
@@ -278,25 +283,72 @@ public class TestGenerator extends Abstr
 
   }
 
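+  /**
+   * Test that the generator produces a fetchlist containing only sitemap
+   * urls when the sitemap flag is set.
+   *
+   * @throws Exception
+   */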
+  @Test
+  public void testGenerateOnlySitemap() throws Exception {
+    boolean sitemap = true;
+    ArrayList<String> urls = new ArrayList<String>();
+    for (int i = 0; i < 10; i++) {
+      urls.add("http://zzz.com/" + i + ".html\tnutch.score=" + i
+          + "\tcustom.attribute=" + i);
+    }
+    int sitemapUrlCnt = 2;
+    for (int i = 10; i < 10 + sitemapUrlCnt; i++) {
+      urls.add("http://zzz.com/" + i + ".html\tnutch.score=" + i
+          + "\tcustom.attribute=" + i
+          + "\t-sitemap");
+    }
+
+    ArrayList<URLWebPage> fetchList = generateForSitemap(urls, sitemap);
+
+    assertEquals(2, fetchList.size());
+  }
+
   /**
-   * Generate Fetchlist.
-   * 
-   * @param numResults
-   *          number of results to generate
-   * @param config
-   *          Configuration to use
-   * @return path to generated batch
-   * @throws IOException
+   * Test that the generator produces a fetchlist without sitemap urls when
+   * the sitemap flag is off.
+   *
+   * @throws Exception
    */
-  private void generateFetchlist(int numResults, Configuration config,
-      boolean filter) throws Exception {
-    // generate batch
-    GeneratorJob g = new GeneratorJob();
-    g.setConf(config);
-    String batchId = g.generate(numResults, System.currentTimeMillis(), filter,
-        false);
-    if (batchId == null)
-      throw new RuntimeException("Generator failed");
+  @Test
+  public void testGenerateNoneSitemap() throws Exception {
+    boolean sitemap = false;
+    ArrayList<String> urls = new ArrayList<String>();
+    for (int i = 0; i < 10; i++) {
+      urls.add("http://zzz.com/" + i + ".html\tnutch.score=" + i
+          + "\tcustom.attribute=" + i);
+    }
+    int sitemapUrlCnt = 2;
+    for (int i = 10; i < 10 + sitemapUrlCnt; i++) {
+      urls.add("http://zzz.com/" + i + ".html\tnutch.score=" + i
+          + "\tcustom.attribute=" + i
+          + "\t-sitemap");
+    }
+
+    ArrayList<URLWebPage> fetchList = generateForSitemap(urls, sitemap);
+
+    assertEquals(10, fetchList.size());
+
+  }
+
+  private ArrayList<URLWebPage> generateForSitemap(ArrayList<String> urls,
+      boolean sitemap) throws Exception {
+    Path urlPath = new Path(testdir, "urls");
+    String batchId = "1234";
+    conf.set(GeneratorJob.BATCH_ID, batchId);
+
+    CrawlTestUtil.generateSeedList(fs, urlPath, urls);
+
+    InjectorJob injector = new InjectorJob();
+    injector.setConf(conf);
+    injector.inject(urlPath);
+
+    Configuration myConfiguration = new Configuration(conf);
+    CrawlTestUtil.generateFetchlist(Integer.MAX_VALUE, myConfiguration, true,
+        sitemap);
+
+    ArrayList<URLWebPage> fetchList = CrawlTestUtil.readContents(webPageStore,
+        Mark.GENERATE_MARK, FIELDS);
+
+    return fetchList;
   }
 
   /**

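The rewritten generator tests drive sitemap handling from the seed list itself. Each seed line is the url followed by tab-separated metadata, and a trailing -sitemap token marks the url as a sitemap. An illustrative seed-file excerpt matching the test input (fields tab-separated; the format is inferred from the tests above, not from seed-list documentation):

    http://zzz.com/10.html	nutch.score=10	custom.attribute=10	-sitemap
    http://zzz.com/11.html	nutch.score=11	custom.attribute=11	-sitemap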
Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java Tue Jan 26 19:19:02 2016
@@ -18,12 +18,13 @@ package org.apache.nutch.crawl;
 
 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.fs.Path;
+import org.apache.nutch.net.URLFilters;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.AbstractNutchTest;
 import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.CrawlTestUtil;
+import org.junit.After;
 import org.junit.Before;
-import org.junit.Ignore;
 import org.junit.Test;
 
 import java.nio.ByteBuffer;
@@ -50,6 +51,120 @@ public class TestInjector extends Abstra
     urlPath = new Path(testdir, "urls");
   }
 
+  /**
+   * Test the injector for sitemap urls.
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testSitemapInject() throws Exception {
+    ArrayList<String> urls = new ArrayList<String>();
+    for (int i = 0; i < 10; i++) {
+      urls.add("http://zzz.com/" + i + ".html\tnutch.score=" + i
+          + "\tcustom.attribute=" + i);
+    }
+    int sitemapUrlCnt = 2;
+    for (int i = 10; i < 10 + sitemapUrlCnt; i++) {
+      urls.add("http://zzz.com/" + i + ".html\tnutch.score=" + i
+          + "\tcustom.attribute=" + i
+          + "\t-sitemap");
+    }
+
+    CrawlTestUtil.generateSeedList(fs, urlPath, urls);
+
+    InjectorJob injector = new InjectorJob();
+    injector.setConf(conf);
+    injector.inject(urlPath);
+
+    List<URLWebPage> pages = CrawlTestUtil.readContents(webPageStore, null,
+        fields);
+    ArrayList<String> read = new ArrayList<String>();
+
+    int sitemapCount = 0;
+
+    for (URLWebPage up : pages) {
+      WebPage page = up.getDatum();
+      String representation = up.getUrl();
+      representation += "\tnutch.score=" + page.getScore().intValue();
+      ByteBuffer bb = page.getMetadata().get(new Utf8("custom.attribute"));
+      if (bb != null) {
+        representation += "\tcustom.attribute=" + Bytes.toString(bb);
+      }
+      if (URLFilters.isSitemap(page)) {
+        representation += "\t-sitemap";
+        sitemapCount++;
+      }
+      read.add(representation);
+    }
+
+    Collections.sort(read);
+    Collections.sort(urls);
+
+    assertEquals(urls.size(), read.size());
+
+    assertTrue(urls.containsAll(read));
+    assertTrue(read.containsAll(urls));
+
+    assertEquals(sitemapUrlCnt, sitemapCount);
+
+  }
+
+  /**
+   * Test the injector for multiple sitemap urls per seed.
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testMultiSitemapInject() throws Exception {
+    ArrayList<String> urls = new ArrayList<String>();
+    for (int i = 0; i < 10; i++) {
+      urls.add("http://zzz" + i + ".com/\tnutch.score=" + i
+          + "\tcustom.attribute=" + i);
+    }
+
+    int sitemapUrlCnt = 2;
+    for (int i = 10; i < 10 + sitemapUrlCnt; i++) {
+      String url = "http://zzz.com/" + i + ".html\tnutch.score=" + i
+          + "\tcustom.attribute=" + i
+          + "\tsitemaps:";
+
+      for (int j = 0; j < sitemapUrlCnt; j++) {
+        url += " sitemap" + j + ".xml";
+      }
+
+      urls.add(url);
+    }
+
+    CrawlTestUtil.generateSeedList(fs, urlPath, urls);
+
+    InjectorJob injector = new InjectorJob();
+    injector.setConf(conf);
+    injector.inject(urlPath);
+
+    List<URLWebPage> pages = CrawlTestUtil.readContents(webPageStore, null,
+        fields);
+    ArrayList<String> read = new ArrayList<String>();
+
+    int sitemapCount = 0;
+
+    for (URLWebPage up : pages) {
+      WebPage page = up.getDatum();
+      if (URLFilters.isSitemap(page)) {
+        sitemapCount++;
+      }
+    }
+
+    assertEquals(sitemapUrlCnt * sitemapUrlCnt, sitemapCount);
+    assertEquals(urls.size() + sitemapUrlCnt * sitemapUrlCnt, pages.size());
+
+  }
+
+  /**
+   * Test for Injector
+   *
+   * @throws Exception
+   */
   @Test
   public void testInject() throws Exception {
     ArrayList<String> urls = new ArrayList<String>();
@@ -119,4 +234,10 @@ public class TestInjector extends Abstra
     }
     return read;
   }
+
+  @Override
+  @After
+  public void tearDown() throws Exception {
+    super.tearDown();
+  }
 }

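testMultiSitemapInject above exercises a second seed form, a trailing sitemaps: token followed by space-separated sitemap locations, which the injector expands into one WebPage per listed sitemap (hence the sitemapUrlCnt * sitemapUrlCnt expectation). An illustrative seed line, again inferred from the test input:

    http://zzz.com/10.html	nutch.score=10	custom.attribute=10	sitemaps: sitemap0.xml sitemap1.xml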
Modified: nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java Tue Jan 26 19:19:02 2016
@@ -20,17 +20,23 @@ import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
+import java.util.Map;
 
 import org.apache.hadoop.fs.Path;
 import org.apache.nutch.crawl.GeneratorJob;
 import org.apache.nutch.crawl.InjectorJob;
 import org.apache.nutch.crawl.URLWebPage;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
 import org.apache.nutch.storage.Mark;
+import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.AbstractNutchTest;
 import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.CrawlTestUtil;
 import org.mortbay.jetty.Server;
 
+import crawlercommons.robots.BaseRobotRules;
+
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Ignore;
@@ -63,20 +69,37 @@ public class TestFetcher extends Abstrac
   public void tearDown() throws Exception {
     server.stop();
     fs.delete(testdir, true);
+    super.tearDown();
   }
 
+  /**
+   * Test fetching only the normal (non-sitemap) web pages.
+   *
+   * @throws Exception
+   */
   @Test
   public void testFetch() throws Exception {
 
+    String batchId = "1234";
+    conf.set(GeneratorJob.BATCH_ID, batchId);
+
     // generate seedlist
+    ArrayList<String> normalUrls = new ArrayList<String>();
+    ArrayList<String> sitemapUrls = new ArrayList<String>();
     ArrayList<String> urls = new ArrayList<String>();
 
-    addUrl(urls, "index.html");
-    addUrl(urls, "pagea.html");
-    addUrl(urls, "pageb.html");
-    addUrl(urls, "dup_of_pagea.html");
-    addUrl(urls, "nested_spider_trap.html");
-    addUrl(urls, "exception.html");
+    addUrl(normalUrls, "index.html");
+    addUrl(normalUrls, "pagea.html");
+    addUrl(normalUrls, "pageb.html");
+    addUrl(normalUrls, "dup_of_pagea.html");
+    addUrl(normalUrls, "nested_spider_trap.html");
+    addUrl(normalUrls, "exception.html");
+    addUrl(sitemapUrls, "sitemap1.xml\t-sitemap");
+    addUrl(sitemapUrls, "sitemap2.xml\t-sitemap");
+    addUrl(sitemapUrls, "sitemapIndex.xml\t-sitemap");
+
+    urls.addAll(normalUrls);
+    urls.addAll(sitemapUrls);
 
     CrawlTestUtil.generateSeedList(fs, urlPath, urls);
 
@@ -87,7 +110,10 @@ public class TestFetcher extends Abstrac
     // generate
     long time = System.currentTimeMillis();
     GeneratorJob g = new GeneratorJob(conf);
-    String batchId = g.generate(Long.MAX_VALUE, time, false, false);
+    // generate the non-sitemap fetchlist
+    g.generate(Long.MAX_VALUE, time, false, false, false);
+    // generate the sitemap-only fetchlist
+    g.generate(Long.MAX_VALUE, time, false, false, true);
 
     // fetch
     time = System.currentTimeMillis();
@@ -104,7 +130,7 @@ public class TestFetcher extends Abstrac
 
     List<URLWebPage> pages = CrawlTestUtil.readContents(webPageStore,
         Mark.FETCH_MARK, (String[]) null);
-    assertEquals(urls.size(), pages.size());
+    assertEquals(normalUrls.size(), pages.size());
     List<String> handledurls = new ArrayList<String>();
     for (URLWebPage up : pages) {
       ByteBuffer bb = up.getDatum().getContent();
@@ -116,15 +142,149 @@ public class TestFetcher extends Abstrac
         handledurls.add(up.getUrl());
       }
     }
-    Collections.sort(urls);
+    Collections.sort(normalUrls);
+    Collections.sort(handledurls);
+
+    // verify that enough pages were handled
+    assertEquals(normalUrls.size(), handledurls.size());
+
+    // verify that correct pages were handled
+    assertTrue(handledurls.containsAll(normalUrls));
+    assertTrue(normalUrls.containsAll(handledurls));
+  }
+
+  /**
+   * Test that only sitemap pages are fetched.
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testSitemapFetch() throws Exception {
+    String batchId = "1234";
+    conf.set(GeneratorJob.BATCH_ID, batchId);
+
+    // generate seedlist
+    ArrayList<String> normalUrls = new ArrayList<String>();
+    ArrayList<String> sitemapUrls = new ArrayList<String>();
+    ArrayList<String> urls = new ArrayList<String>();
+
+    addUrl(normalUrls, "index.html");
+    addUrl(normalUrls, "pagea.html");
+    addUrl(normalUrls, "pageb.html");
+    addUrl(normalUrls, "dup_of_pagea.html");
+    addUrl(normalUrls, "nested_spider_trap.html");
+    addUrl(normalUrls, "exception.html");
+    addUrl(sitemapUrls, "sitemap1.xml\t-sitemap");
+    addUrl(sitemapUrls, "sitemap2.xml\t-sitemap");
+    addUrl(sitemapUrls, "sitemapIndex.xml\t-sitemap");
+
+    urls.addAll(normalUrls);
+    urls.addAll(sitemapUrls);
+
+    String[] fields = new String[] {
+        WebPage.Field.MARKERS.getName(), WebPage.Field.SCORE.getName() };
+
+    Path urlPath = new Path(testdir, "urls");
+
+    CrawlTestUtil.generateSeedList(fs, urlPath, urls);
+
+    InjectorJob injector = new InjectorJob();
+    injector.setConf(conf);
+    injector.inject(urlPath);
+
+    // generate
+    long time = System.currentTimeMillis();
+    GeneratorJob g = new GeneratorJob(conf);
+
+    // generate the non-sitemap fetchlist
+    g.generate(Long.MAX_VALUE, time, false, false, false);
+    // generate the sitemap-only fetchlist
+    g.generate(Long.MAX_VALUE, time, false, false, true);
+
+    conf.setBoolean(FetcherJob.PARSE_KEY, true);
+    FetcherJob fetcher = new FetcherJob(conf);
+
+    // for only sitemap fetch
+    fetcher.fetch(batchId, 1, false, -1, false, true);
+
+    List<URLWebPage> pages = CrawlTestUtil.readContents(webPageStore,
+        Mark.FETCH_MARK, (String[]) null);
+    assertEquals(sitemapUrls.size(), pages.size());
+    List<String> handledurls = new ArrayList<String>();
+    for (URLWebPage up : pages) {
+      ByteBuffer bb = up.getDatum().getContent();
+      if (bb == null) {
+        continue;
+      }
+      String content = Bytes.toString(bb);
+      if (content.indexOf("sitemap") != -1) {
+        handledurls.add(up.getUrl() + "\t-sitemap");
+      }
+    }
+    Collections.sort(sitemapUrls);
     Collections.sort(handledurls);
 
     // verify that enough pages were handled
-    assertEquals(urls.size(), handledurls.size());
+    assertEquals(sitemapUrls.size(), handledurls.size());
 
     // verify that correct pages were handled
-    assertTrue(handledurls.containsAll(urls));
-    assertTrue(urls.containsAll(handledurls));
+    assertTrue(handledurls.containsAll(sitemapUrls));
+    assertTrue(sitemapUrls.containsAll(handledurls));
+
+  }
+
+  /**
+   * Test sitemap detection from robots.txt.
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testSitemapDetect() throws Exception {
+    String batchId = "1234";
+    conf.set(GeneratorJob.BATCH_ID, batchId);
+
+    // generate seedlist
+    ArrayList<String> urls = new ArrayList<String>();
+
+    addUrl(urls, "");
+
+    String[] fields = new String[] {
+        WebPage.Field.MARKERS.getName(), WebPage.Field.SCORE.getName() };
+
+    Path urlPath = new Path(testdir, "urls");
+
+    CrawlTestUtil.generateSeedList(fs, urlPath, urls);
+
+    InjectorJob injector = new InjectorJob();
+    injector.setConf(conf);
+    injector.inject(urlPath);
+
+    // generate
+    long time = System.currentTimeMillis();
+    GeneratorJob g = new GeneratorJob(conf);
+
+    g.generate(Long.MAX_VALUE, time, false, false, false);
+
+    conf.setBoolean(FetcherJob.PARSE_KEY, true);
+    FetcherJob fetcher = new FetcherJob(conf);
+
+    // fetch with sitemap detection from robots.txt enabled
+    fetcher.fetch(batchId, 1, false, -1, true, false);
+
+    List<URLWebPage> pages = CrawlTestUtil.readContents(webPageStore,
+        Mark.FETCH_MARK, (String[]) null);
+    assertEquals(urls.size(), pages.size());
+    for (URLWebPage up : pages) {
+
+      ProtocolFactory protocolFactory = new ProtocolFactory(conf);
+      Protocol protocol = protocolFactory.getProtocol(up.getUrl());
+      BaseRobotRules rules = protocol.getRobotRules(up.getUrl(),
+          up.getDatum());
+
+      Map<CharSequence, CharSequence> sitemaps = up.getDatum().getSitemaps();
+      assertEquals(rules.getSitemaps().size(),
+          sitemaps.size()); // the robots.txt fixture declares 3 sitemap URLs
+    }
   }
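
[The assertion above compares against BaseRobotRules.getSitemaps(), i.e. the
Sitemap directives that crawler-commons extracts from the fetched robots.txt.
A standalone sketch of that library usage, separate from this patch:

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class RobotsSitemapDemo {
      public static void main(String[] args) {
        String robotsTxt = "Sitemap: http://localhost/sitemap1.xml\n"
            + "Sitemap: http://localhost/sitemap2.xml\n"
            + "Sitemap: http://localhost/sitemap3.xml\n";
        // parseContent(url, content, contentType, robotNames)
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
            "http://localhost/robots.txt", robotsTxt.getBytes(),
            "text/plain", "Nutch-Test");
        // Prints the three sitemap URLs declared above.
        for (String sitemap : rules.getSitemaps()) {
          System.out.println(sitemap);
        }
      }
    }
]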
 
   private void addUrl(ArrayList<String> urls, String page) {

Added: nutch/branches/2.x/src/test/org/apache/nutch/parse/TestSitemapParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/parse/TestSitemapParser.java?rev=1726853&view=auto
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/parse/TestSitemapParser.java (added)
+++ nutch/branches/2.x/src/test/org/apache/nutch/parse/TestSitemapParser.java Tue Jan 26 19:19:02 2016
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.apache.nutch.storage.ParseStatus;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.AbstractNutchTest;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.nio.ByteBuffer;
+import java.util.HashMap;
+
+import static org.junit.Assert.*;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+public class TestSitemapParser extends AbstractNutchTest {
+
+  WebPage page;
+
+  @Override
+  @Before
+  public void setUp() throws Exception {
+    super.setUp();
+    page = mock(WebPage.class);
+  }
+
+  @Override
+  @After
+  public void tearDown() throws Exception {
+    super.tearDown();
+  }
+
+  @Test
+  public void testSitemapParser() throws Exception {
+    String sitemapUrl = "http://localhost/sitemap.xml";
+    int urlSize = 5;
+    String content = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+        + "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\">\n\t"
+        + "<url>\n\t\t<loc>http://localhost/zzz1.html</loc>\n\t\t<lastmod>2015-06-10</lastmod>\n\t\t<changefreq>monthly</changefreq>tml\n\t\t<priority>0.8</priority>\n\t</url>\n\t"
+        + "<url>\n\t\t<loc>http://localhost/zzz2.html</loc>\n\t\t<lastmod>2015-06-10</lastmod>\n\t\t<changefreq>monthly</changefreq>\n\t\t<priority>0.8</priority>\n\t</url>\n\t"
+        + "<url>\n\t\t<loc>http://localhost/zzz3.html</loc>\n\t\t<lastmod>2015-06-10</lastmod>\n\t\t<changefreq>monthly</changefreq>\n\t\t<priority>0.8</priority>\n\t</url>\n\t"
+        + "<url>\n\t\t<loc>http://localhost/zzz4.html</loc>\n\t\t<lastmod>2015-06-10</lastmod>\n\t\t<changefreq>monthly</changefreq>\n\t\t<priority>0.8</priority>\n\t</url>\n\t"
+        + "<url>\n\t\t<loc>http://localhost/zzz5.html</loc>\n\t\t<lastmod>2015-06-10</lastmod>\n\t\t<changefreq>monthly</changefreq>\n\t\t<priority>0.8</priority>\n\t</url>\n"
+        + "</urlset>";
+    when(page.getContent()).thenReturn(ByteBuffer.wrap(content.getBytes()));
+    when(page.getContentType()).thenReturn("application/xml");
+
+    NutchSitemapParser sParser = new NutchSitemapParser();
+    NutchSitemapParse nutchSitemapParse = sParser.getParse(sitemapUrl, page);
+
+    assertNotNull(nutchSitemapParse);
+
+    ParseStatus pstatus = nutchSitemapParse.getParseStatus();
+    assertTrue(ParseStatusUtils.isSuccess(pstatus));
+
+    assertEquals(urlSize, nutchSitemapParse.getOutlinkMap().size());
+  }
+
+  @Test
+  public void testSitemapIndexParser() throws Exception {
+    String sitemapUrl = "http://localhost/sitemapIndex.xml";
+    int urlSize = 3;
+    String content = "<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n"
+        + "    <sitemap>\n"
+        + "        <loc>http://localhost/sitemap1.xml</loc>\n"
+        + "        <lastmod>2015-07-30</lastmod>\n"
+        + "    </sitemap>\n"
+        + "    <sitemap>\n"
+        + "        <loc>http://localhost/sitemap2.xml</loc>\n"
+        + "        <lastmod>2015-07-30</lastmod>\n"
+        + "    </sitemap>\n"
+        + "    <sitemap>\n"
+        + "        <loc>http://localhost/sitemap3.xml</loc>\n"
+        + "        <lastmod>2015-07-30</lastmod>\n"
+        + "    </sitemap>\n"
+        + "</sitemapindex>";
+
+    when(page.getContent()).thenReturn(ByteBuffer.wrap(content.getBytes()));
+    when(page.getContentType()).thenReturn("application/xml");
+    when(page.getSitemaps()).thenReturn(new HashMap<CharSequence, CharSequence>());
+
+    NutchSitemapParser sParser = new NutchSitemapParser();
+    NutchSitemapParse nutchSitemapParse = sParser.getParse(sitemapUrl, page);
+
+    assertNotNull(nutchSitemapParse);
+    assertNull(nutchSitemapParse.getOutlinkMap());
+
+    ParseStatus pstatus = nutchSitemapParse.getParseStatus();
+    assertTrue(ParseStatusUtils.isSuccess(pstatus));
+
+    assertEquals(urlSize, page.getSitemaps().size());
+  }
+}
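
[Both tests above hinge on the urlset/sitemapindex distinction. A
self-contained sketch of how crawler-commons' SiteMapParser exposes it
(library usage only; NutchSitemapParser's actual implementation is not shown
in this diff):

    import java.net.URL;
    import java.util.Collection;
    import crawlercommons.sitemaps.AbstractSiteMap;
    import crawlercommons.sitemaps.SiteMap;
    import crawlercommons.sitemaps.SiteMapIndex;
    import crawlercommons.sitemaps.SiteMapParser;
    import crawlercommons.sitemaps.SiteMapURL;

    public class SitemapParseDemo {
      public static void main(String[] args) throws Exception {
        String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
            + "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"
            + "<url><loc>http://localhost/zzz1.html</loc></url>"
            + "</urlset>";
        AbstractSiteMap sm = new SiteMapParser().parseSiteMap(
            "application/xml", xml.getBytes(),
            new URL("http://localhost/sitemap.xml"));
        if (sm.isIndex()) {
          // A <sitemapindex> yields child sitemaps instead of page outlinks,
          // which is why testSitemapIndexParser expects a null outlink map.
          Collection<AbstractSiteMap> children = ((SiteMapIndex) sm).getSitemaps();
          System.out.println("index with " + children.size() + " sitemaps");
        } else {
          Collection<SiteMapURL> urls = ((SiteMap) sm).getSiteMapUrls();
          System.out.println("urlset with " + urls.size() + " urls");
        }
      }
    }
]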

Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java Tue Jan 26 19:19:02 2016
@@ -23,6 +23,7 @@ import org.apache.hadoop.conf.Configurat
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.nutch.crawl.GeneratorJob;
 import org.apache.nutch.crawl.URLWebPage;
 import org.apache.nutch.storage.Mark;
 import org.apache.nutch.storage.WebPage;
@@ -97,7 +98,7 @@ public class CrawlTestUtil {
    */
   public static ArrayList<URLWebPage> readContents(
       DataStore<String, WebPage> store, Mark requiredMark, String... fields)
-      throws Exception {
+          throws Exception {
     ArrayList<URLWebPage> l = new ArrayList<URLWebPage>();
 
     Query<String, WebPage> query = store.newQuery();
@@ -145,4 +146,21 @@ public class CrawlTestUtil {
     webServer.setHandler(handlers);
     return webServer;
   }
+
+  /**
+   * Generate a fetchlist batch.
+   *
+   * @param numResults number of results to generate
+   * @param config     Configuration to use
+   * @param filter     whether to apply URL filters
+   * @param sitemap    whether to generate sitemap URLs only
+   * @throws Exception
+   */
+  public static void generateFetchlist(int numResults, Configuration config,
+      boolean filter, boolean sitemap) throws Exception {
+    // generate a batch
+    GeneratorJob g = new GeneratorJob();
+    g.setConf(config);
+    g.generate(numResults, System.currentTimeMillis(), filter, false, sitemap);
+  }
 }
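
[A hypothetical call site for the new helper, mirroring the sitemap-only
generate call in the fetcher test above (argument values are illustrative):

    // Generate an unbounded sitemap-only batch without URL filtering,
    // assuming a configured Hadoop Configuration named conf.
    CrawlTestUtil.generateFetchlist(Integer.MAX_VALUE, conf, false, true);
]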

Added: nutch/branches/2.x/src/test/org/apache/nutch/util/HelloHandler.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/HelloHandler.java?rev=1726853&view=auto
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/util/HelloHandler.java (added)
+++ nutch/branches/2.x/src/test/org/apache/nutch/util/HelloHandler.java Tue Jan 26 19:19:02 2016
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.util;
+
+import org.mortbay.jetty.Handler;
+import org.mortbay.jetty.Server;
+import org.mortbay.jetty.handler.DefaultHandler;
+import org.mortbay.jetty.handler.HandlerList;
+import org.mortbay.jetty.handler.ResourceHandler;
+
+/**
+ * Standalone Jetty launcher that serves the fetch test site on port 55000.
+ */
+public class HelloHandler {
+
+  public static void main(String[] args) throws Exception {
+    Server webServer = new Server(55000);
+    // Serve the static test fixtures (robots.txt, sitemaps, pages).
+    ResourceHandler handler = new ResourceHandler();
+    handler.setResourceBase("build/test/data/fetch-test-site");
+    HandlerList handlers = new HandlerList();
+    handlers.setHandlers(new Handler[] { handler, new DefaultHandler() });
+    webServer.setHandler(handlers);
+    webServer.start();
+  }
+}
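
[Once started, the launcher serves the fixture site (robots.txt and the
sitemap files below) on port 55000; a quick sanity check, assuming commons-io
on the classpath as in the tests above:

    // Fetch the served robots.txt and print it.
    java.io.InputStream in =
        new java.net.URL("http://127.0.0.1:55000/robots.txt").openStream();
    try {
      System.out.println(org.apache.commons.io.IOUtils.toString(in));
    } finally {
      in.close();
    }
]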

Modified: nutch/branches/2.x/src/testresources/fetch-test-site/robots.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/testresources/fetch-test-site/robots.txt?rev=1726853&r1=1726852&r2=1726853&view=diff
==============================================================================
--- nutch/branches/2.x/src/testresources/fetch-test-site/robots.txt (original)
+++ nutch/branches/2.x/src/testresources/fetch-test-site/robots.txt Tue Jan 26 19:19:02 2016
@@ -0,0 +1,18 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+Sitemap: http://localhost/sitemap1.xml
+Sitemap: http://localhost/sitemap2.xml
+Sitemap: http://localhost/sitemap3.xml
\ No newline at end of file

Added: nutch/branches/2.x/src/testresources/fetch-test-site/sitemap1.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/testresources/fetch-test-site/sitemap1.xml?rev=1726853&view=auto
==============================================================================
--- nutch/branches/2.x/src/testresources/fetch-test-site/sitemap1.xml (added)
+++ nutch/branches/2.x/src/testresources/fetch-test-site/sitemap1.xml Tue Jan 26 19:19:02 2016
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+        xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+    <url>
+        <loc>http://localhost/zzz1.html</loc>
+        <lastmod>2015-06-10</lastmod>
+        <changefreq>monthly</changefreq>
+        <priority>0.8</priority>
+    </url>
+    <url>
+        <loc>http://localhost/zzz2.html</loc>
+        <lastmod>2015-06-10</lastmod>
+        <changefreq>monthly</changefreq>
+        <priority>0.8</priority>
+    </url>
+    <url>
+        <loc>http://localhost/zzz3.html</loc>
+        <lastmod>2015-06-10</lastmod>
+        <changefreq>monthly</changefreq>
+        <priority>0.8</priority>
+    </url>
+    <url>
+        <loc>http://localhost/zzz4.html</loc>
+        <lastmod>2015-06-10</lastmod>
+        <changefreq>monthly</changefreq>
+        <priority>0.8</priority>
+    </url>
+    <url>
+        <loc>http://localhost/zzz5.html</loc>
+        <lastmod>2015-06-10</lastmod>
+        <changefreq>monthly</changefreq>
+        <priority>0.8</priority>
+    </url>
+</urlset>

Added: nutch/branches/2.x/src/testresources/fetch-test-site/sitemap2.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/testresources/fetch-test-site/sitemap2.xml?rev=1726853&view=auto
==============================================================================
--- nutch/branches/2.x/src/testresources/fetch-test-site/sitemap2.xml (added)
+++ nutch/branches/2.x/src/testresources/fetch-test-site/sitemap2.xml Tue Jan 26 19:19:02 2016
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+        xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9 http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">
+    <url>
+        <loc>http://localhost/zzz1.html</loc>
+        <lastmod>2015-06-10</lastmod>
+        <changefreq>monthly</changefreq>
+        <priority>0.8</priority>
+    </url>
+    <url>
+        <loc>http://localhost/zzz2.html</loc>
+        <lastmod>2015-06-10</lastmod>
+        <changefreq>monthly</changefreq>
+        <priority>0.8</priority>
+    </url>
+    <url>
+        <loc>http://localhost/zzz3.html</loc>
+        <lastmod>2015-06-10</lastmod>
+        <changefreq>monthly</changefreq>
+        <priority>0.8</priority>
+    </url>
+    <url>
+        <loc>http://localhost/zzz4.html</loc>
+        <lastmod>2015-06-10</lastmod>
+        <changefreq>monthly</changefreq>
+        <priority>0.8</priority>
+    </url>
+    <url>
+        <loc>http://localhost/zzz5.html</loc>
+        <lastmod>2015-06-10</lastmod>
+        <changefreq>monthly</changefreq>
+        <priority>0.8</priority>
+    </url>
+</urlset>

Added: nutch/branches/2.x/src/testresources/fetch-test-site/sitemapIndex.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/testresources/fetch-test-site/sitemapIndex.xml?rev=1726853&view=auto
==============================================================================
--- nutch/branches/2.x/src/testresources/fetch-test-site/sitemapIndex.xml (added)
+++ nutch/branches/2.x/src/testresources/fetch-test-site/sitemapIndex.xml Tue Jan 26 19:19:02 2016
@@ -0,0 +1,31 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+    <sitemap>
+        <loc>http://localhost/sitemapIndex1.xml</loc>
+        <lastmod>2015-07-30</lastmod>
+    </sitemap>
+    <sitemap>
+        <loc>http://localhost/sitemapIndex2.xml</loc>
+        <lastmod>2015-07-30</lastmod>
+    </sitemap>
+    <sitemap>
+        <loc>http://localhost/sitemapIndex3.xml</loc>
+        <lastmod>2015-07-30</lastmod>
+    </sitemap>
+</sitemapindex>