You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/29 06:39:03 UTC

svn commit: r1655526 [6/26] - in /nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/pr...

Modified: nutch/trunk/src/java/org/apache/nutch/metadata/DublinCore.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/DublinCore.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/DublinCore.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/DublinCore.java Thu Jan 29 05:38:59 2015
@@ -16,149 +16,146 @@
  */
 package org.apache.nutch.metadata;
 
-
 /**
  * A collection of Dublin Core metadata names.
- *
- * @see <a href="http://dublincore.org">dublincore.org</a> 
- *
+ * 
+ * @see <a href="http://dublincore.org">dublincore.org</a>
+ * 
  * @author Chris Mattmann
  * @author J&eacute;r&ocirc;me Charron
  */
 public interface DublinCore {
-  
-    
+
   /**
-   * Typically, Format may include the media-type or dimensions of the
-   * resource. Format may be used to determine the software, hardware or other
-   * equipment needed to display or operate the resource. Examples of
-   * dimensions include size and duration. Recommended best practice is to
-   * select a value from a controlled vocabulary (for example, the list of
-   * Internet Media Types [MIME] defining computer media formats).
+   * Typically, Format may include the media-type or dimensions of the resource.
+   * Format may be used to determine the software, hardware or other equipment
+   * needed to display or operate the resource. Examples of dimensions include
+   * size and duration. Recommended best practice is to select a value from a
+   * controlled vocabulary (for example, the list of Internet Media Types [MIME]
+   * defining computer media formats).
    */
   public static final String FORMAT = "format";
-  
+
   /**
-   * Recommended best practice is to identify the resource by means of a
-   * string or number conforming to a formal identification system. Example
-   * formal identification systems include the Uniform Resource Identifier
-   * (URI) (including the Uniform Resource Locator (URL)), the Digital Object
+   * Recommended best practice is to identify the resource by means of a string
+   * or number conforming to a formal identification system. Example formal
+   * identification systems include the Uniform Resource Identifier (URI)
+   * (including the Uniform Resource Locator (URL)), the Digital Object
    * Identifier (DOI) and the International Standard Book Number (ISBN).
    */
   public static final String IDENTIFIER = "identifier";
-  
+
   /**
    * Date on which the resource was changed.
    */
   public static final String MODIFIED = "modified";
-  
+
   /**
    * An entity responsible for making contributions to the content of the
-   * resource. Examples of a Contributor include a person, an organisation, or
-   * a service. Typically, the name of a Contributor should be used to
-   * indicate the entity.
+   * resource. Examples of a Contributor include a person, an organisation, or a
+   * service. Typically, the name of a Contributor should be used to indicate
+   * the entity.
    */
   public static final String CONTRIBUTOR = "contributor";
-  
+
   /**
-   * The extent or scope of the content of the resource. Coverage will
-   * typically include spatial location (a place name or geographic
-   * coordinates), temporal period (a period label, date, or date range) or
-   * jurisdiction (such as a named administrative entity). Recommended best
-   * practice is to select a value from a controlled vocabulary (for example,
-   * the Thesaurus of Geographic Names [TGN]) and that, where appropriate,
-   * named places or time periods be used in preference to numeric identifiers
-   * such as sets of coordinates or date ranges.
+   * The extent or scope of the content of the resource. Coverage will typically
+   * include spatial location (a place name or geographic coordinates), temporal
+   * period (a period label, date, or date range) or jurisdiction (such as a
+   * named administrative entity). Recommended best practice is to select a
+   * value from a controlled vocabulary (for example, the Thesaurus of
+   * Geographic Names [TGN]) and that, where appropriate, named places or time
+   * periods be used in preference to numeric identifiers such as sets of
+   * coordinates or date ranges.
    */
   public static final String COVERAGE = "coverage";
-  
+
   /**
    * An entity primarily responsible for making the content of the resource.
    * Examples of a Creator include a person, an organisation, or a service.
    * Typically, the name of a Creator should be used to indicate the entity.
    */
   public static final String CREATOR = "creator";
-  
+
   /**
    * A date associated with an event in the life cycle of the resource.
-   * Typically, Date will be associated with the creation or availability of
-   * the resource. Recommended best practice for encoding the date value is
-   * defined in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD
-   * format.
+   * Typically, Date will be associated with the creation or availability of the
+   * resource. Recommended best practice for encoding the date value is defined
+   * in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD format.
    */
   public static final String DATE = "date";
-  
+
   /**
    * An account of the content of the resource. Description may include but is
    * not limited to: an abstract, table of contents, reference to a graphical
    * representation of content or a free-text account of the content.
    */
   public static final String DESCRIPTION = "description";
-  
+
   /**
    * A language of the intellectual content of the resource. Recommended best
    * practice is to use RFC 3066 [RFC3066], which, in conjunction with ISO 639
-   * [ISO639], defines two- and three-letter primary language tags with
-   * optional subtags. Examples include "en" or "eng" for English, "akk" for
-   * Akkadian, and "en-GB" for English used in the United Kingdom.
+   * [ISO639], defines two- and three-letter primary language tags with optional
+   * subtags. Examples include "en" or "eng" for English, "akk" for Akkadian,
+   * and "en-GB" for English used in the United Kingdom.
    */
   public static final String LANGUAGE = "language";
-  
+
   /**
    * An entity responsible for making the resource available. Examples of a
    * Publisher include a person, an organisation, or a service. Typically, the
    * name of a Publisher should be used to indicate the entity.
    */
   public static final String PUBLISHER = "publisher";
-  
+
   /**
    * A reference to a related resource. Recommended best practice is to
    * reference the resource by means of a string or number conforming to a
    * formal identification system.
    */
   public static final String RELATION = "relation";
-  
+
   /**
-   * Information about rights held in and over the resource. Typically, a
-   * Rights element will contain a rights management statement for the
-   * resource, or reference a service providing such information. Rights
-   * information often encompasses Intellectual Property Rights (IPR),
-   * Copyright, and various Property Rights. If the Rights element is absent,
-   * no assumptions can be made about the status of these and other rights
-   * with respect to the resource.
+   * Information about rights held in and over the resource. Typically, a Rights
+   * element will contain a rights management statement for the resource, or
+   * reference a service providing such information. Rights information often
+   * encompasses Intellectual Property Rights (IPR), Copyright, and various
+   * Property Rights. If the Rights element is absent, no assumptions can be
+   * made about the status of these and other rights with respect to the
+   * resource.
    */
   public static final String RIGHTS = "rights";
-  
+
   /**
    * A reference to a resource from which the present resource is derived. The
    * present resource may be derived from the Source resource in whole or in
-   * part. Recommended best practice is to reference the resource by means of
-   * a string or number conforming to a formal identification system.
+   * part. Recommended best practice is to reference the resource by means of a
+   * string or number conforming to a formal identification system.
    */
   public static final String SOURCE = "source";
-  
+
   /**
    * The topic of the content of the resource. Typically, a Subject will be
-   * expressed as keywords, key phrases or classification codes that describe
-   * a topic of the resource. Recommended best practice is to select a value
-   * from a controlled vocabulary or formal classification scheme.
+   * expressed as keywords, key phrases or classification codes that describe a
+   * topic of the resource. Recommended best practice is to select a value from
+   * a controlled vocabulary or formal classification scheme.
    */
   public static final String SUBJECT = "subject";
-  
+
   /**
    * A name given to the resource. Typically, a Title will be a name by which
    * the resource is formally known.
    */
   public static final String TITLE = "title";
-  
+
   /**
    * The nature or genre of the content of the resource. Type includes terms
-   * describing general categories, functions, genres, or aggregation levels
-   * for content. Recommended best practice is to select a value from a
-   * controlled vocabulary (for example, the DCMI Type Vocabulary [DCMITYPE]).
-   * To describe the physical or digital manifestation of the resource, use
-   * the Format element.
+   * describing general categories, functions, genres, or aggregation levels for
+   * content. Recommended best practice is to select a value from a controlled
+   * vocabulary (for example, the DCMI Type Vocabulary [DCMITYPE]). To describe
+   * the physical or digital manifestation of the resource, use the Format
+   * element.
    */
   public static final String TYPE = "type";
-  
+
 }

Modified: nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java Thu Jan 29 05:38:59 2015
@@ -20,32 +20,32 @@ import org.apache.hadoop.io.Text;
 
 /**
  * A collection of HTTP header names.
- *
- * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/">Hypertext Transfer
- *      Protocol -- HTTP/1.1 (RFC 2616)</a>
+ * 
+ * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/">Hypertext Transfer Protocol
+ *      -- HTTP/1.1 (RFC 2616)</a>
  */
 public interface HttpHeaders {
 
   public final static String TRANSFER_ENCODING = "Transfer-Encoding";
-	
+
   public final static String CONTENT_ENCODING = "Content-Encoding";
-  
+
   public final static String CONTENT_LANGUAGE = "Content-Language";
 
   public final static String CONTENT_LENGTH = "Content-Length";
-  
+
   public final static String CONTENT_LOCATION = "Content-Location";
-  
+
   public static final String CONTENT_DISPOSITION = "Content-Disposition";
 
   public final static String CONTENT_MD5 = "Content-MD5";
-  
+
   public final static String CONTENT_TYPE = "Content-Type";
 
   public static final Text WRITABLE_CONTENT_TYPE = new Text(CONTENT_TYPE);
-  
+
   public final static String LAST_MODIFIED = "Last-Modified";
-  
+
   public final static String LOCATION = "Location";
 
 }

Modified: nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java Thu Jan 29 05:38:59 2015
@@ -28,28 +28,29 @@ import org.apache.nutch.crawl.NutchWrita
 /**
  * This is a simple decorator that adds metadata to any Writable-s that can be
  * serialized by <tt>NutchWritable</tt>. This is useful when data needs to be
- * temporarily enriched during processing, but this
- * temporary metadata doesn't need to be permanently stored after the job is done.
+ * temporarily enriched during processing, but this temporary metadata doesn't
+ * need to be permanently stored after the job is done.
  * 
  * @author Andrzej Bialecki
  */
 public class MetaWrapper extends NutchWritable {
   private Metadata metadata;
-  
+
   public MetaWrapper() {
     super();
     metadata = new Metadata();
   }
-  
+
   public MetaWrapper(Writable instance, Configuration conf) {
     super(instance);
     metadata = new Metadata();
     setConf(conf);
   }
-  
+
   public MetaWrapper(Metadata metadata, Writable instance, Configuration conf) {
     super(instance);
-    if (metadata == null) metadata = new Metadata();
+    if (metadata == null)
+      metadata = new Metadata();
     this.metadata = metadata;
     setConf(conf);
   }
@@ -60,43 +61,52 @@ public class MetaWrapper extends NutchWr
   public Metadata getMetadata() {
     return metadata;
   }
-  
+
   /**
-   * Add metadata. See {@link Metadata#add(String, String)} for more information.
-   * @param name metadata name
-   * @param value metadata value
+   * Add metadata. See {@link Metadata#add(String, String)} for more
+   * information.
+   * 
+   * @param name
+   *          metadata name
+   * @param value
+   *          metadata value
    */
   public void addMeta(String name, String value) {
     metadata.add(name, value);
   }
-  
+
   /**
-   * Set metadata. See {@link Metadata#set(String, String)} for more information.
+   * Set metadata. See {@link Metadata#set(String, String)} for more
+   * information.
+   * 
    * @param name
    * @param value
    */
   public void setMeta(String name, String value) {
     metadata.set(name, value);
   }
-  
+
   /**
    * Get metadata. See {@link Metadata#get(String)} for more information.
+   * 
    * @param name
    * @return metadata value
    */
   public String getMeta(String name) {
     return metadata.get(name);
   }
-  
+
   /**
-   * Get multiple metadata. See {@link Metadata#getValues(String)} for more information.
+   * Get multiple metadata. See {@link Metadata#getValues(String)} for more
+   * information.
+   * 
    * @param name
    * @return multiple values
    */
   public String[] getMetaValues(String name) {
     return metadata.getValues(name);
   }
-  
+
   public void readFields(DataInput in) throws IOException {
     super.readFields(in);
     metadata = new Metadata();

Modified: nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Thu Jan 29 05:38:59 2015
@@ -30,15 +30,14 @@ import org.apache.hadoop.io.Writable;
 /**
  * A multi-valued metadata container.
  */
-public class Metadata implements Writable, CreativeCommons,
-DublinCore, HttpHeaders, Nutch, Feed {
+public class Metadata implements Writable, CreativeCommons, DublinCore,
+    HttpHeaders, Nutch, Feed {
 
   /**
    * A map of all metadata attributes.
    */
   private Map<String, String[]> metadata = null;
 
-
   /**
    * Constructs a new, empty metadata.
    */
@@ -48,9 +47,10 @@ DublinCore, HttpHeaders, Nutch, Feed {
 
   /**
    * Returns true if named value is multivalued.
-   * @param name name of metadata
-   * @return true is named value is multivalued, false if single
-   * value or null
+   * 
+   * @param name
+   *          name of metadata
+   * @return true if named value is multivalued, false if single value or null
    */
   public boolean isMultiValued(final String name) {
     return metadata.get(name) != null && metadata.get(name).length > 1;
@@ -58,6 +58,7 @@ DublinCore, HttpHeaders, Nutch, Feed {
 
   /**
    * Returns an array of the names contained in the metadata.
+   * 
    * @return Metadata names
    */
   public String[] names() {
@@ -65,11 +66,11 @@ DublinCore, HttpHeaders, Nutch, Feed {
   }
 
   /**
-   * Get the value associated to a metadata name.
-   * If many values are assiociated to the specified name, then the first
-   * one is returned.
-   *
-   * @param name of the metadata.
+   * Get the value associated to a metadata name. If many values are associated
+   * to the specified name, then the first one is returned.
+   * 
+   * @param name
+   *          of the metadata.
    * @return the value associated to the specified metadata name.
    */
   public String get(final String name) {
@@ -83,13 +84,15 @@ DublinCore, HttpHeaders, Nutch, Feed {
 
   /**
    * Get the values associated to a metadata name.
-   * @param name of the metadata.
+   * 
+   * @param name
+   *          of the metadata.
    * @return the values associated to a metadata name.
    */
   public String[] getValues(final String name) {
     return _getValues(name);
   }
-  
+
   private String[] _getValues(final String name) {
     String[] values = metadata.get(name);
     if (values == null) {
@@ -99,12 +102,13 @@ DublinCore, HttpHeaders, Nutch, Feed {
   }
 
   /**
-   * Add a metadata name/value mapping.
-   * Add the specified value to the list of values associated to the
-   * specified metadata name.
-   *
-   * @param name the metadata name.
-   * @param value the metadata value.
+   * Add a metadata name/value mapping. Add the specified value to the list of
+   * values associated to the specified metadata name.
+   * 
+   * @param name
+   *          the metadata name.
+   * @param value
+   *          the metadata value.
    */
   public void add(final String name, final String value) {
     String[] values = metadata.get(name);
@@ -120,31 +124,37 @@ DublinCore, HttpHeaders, Nutch, Feed {
 
   /**
    * Copy All key-value pairs from properties.
-   * @param properties properties to copy from
+   * 
+   * @param properties
+   *          properties to copy from
    */
   public void setAll(Properties properties) {
     Enumeration<?> names = properties.propertyNames();
     while (names.hasMoreElements()) {
       String name = (String) names.nextElement();
-      metadata.put(name, new String[]{properties.getProperty(name)});
+      metadata.put(name, new String[] { properties.getProperty(name) });
     }
   }
 
   /**
-   * Set metadata name/value.
-   * Associate the specified value to the specified metadata name. If some
-   * previous values were associated to this name, they are removed.
-   *
-   * @param name the metadata name.
-   * @param value the metadata value.
+   * Set metadata name/value. Associate the specified value to the specified
+   * metadata name. If some previous values were associated to this name, they
+   * are removed.
+   * 
+   * @param name
+   *          the metadata name.
+   * @param value
+   *          the metadata value.
    */
   public void set(String name, String value) {
-    metadata.put(name, new String[]{value});
+    metadata.put(name, new String[] { value });
   }
 
   /**
    * Remove a metadata and all its associated values.
-   * @param name metadata name to remove
+   * 
+   * @param name
+   *          metadata name to remove
    */
   public void remove(String name) {
     metadata.remove(name);
@@ -152,12 +162,13 @@ DublinCore, HttpHeaders, Nutch, Feed {
 
   /**
    * Returns the number of metadata names in this metadata.
+   * 
    * @return number of metadata names
    */
   public int size() {
     return metadata.size();
   }
-  
+
   /** Remove all mappings from metadata. */
   public void clear() {
     metadata.clear();
@@ -165,7 +176,9 @@ DublinCore, HttpHeaders, Nutch, Feed {
 
   public boolean equals(Object o) {
 
-    if (o == null) { return false; }
+    if (o == null) {
+      return false;
+    }
 
     Metadata other = null;
     try {
@@ -174,7 +187,9 @@ DublinCore, HttpHeaders, Nutch, Feed {
       return false;
     }
 
-    if (other.size() != size()) { return false; }
+    if (other.size() != size()) {
+      return false;
+    }
 
     String[] names = names();
     for (int i = 0; i < names.length; i++) {
@@ -198,10 +213,7 @@ DublinCore, HttpHeaders, Nutch, Feed {
     for (int i = 0; i < names.length; i++) {
       String[] values = _getValues(names[i]);
       for (int j = 0; j < values.length; j++) {
-        buf.append(names[i])
-           .append("=")
-           .append(values[j])
-           .append(" ");
+        buf.append(names[i]).append("=").append(values[j]).append(" ");
       }
     }
     return buf.toString();

Modified: nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Thu Jan 29 05:38:59 2015
@@ -18,20 +18,17 @@ package org.apache.nutch.metadata;
 
 import org.apache.hadoop.io.Text;
 
-
 /**
  * A collection of Nutch internal metadata constants.
- *
+ * 
  * @author Chris Mattmann
  * @author J&eacute;r&ocirc;me Charron
  */
 public interface Nutch {
-  
-  public static final String ORIGINAL_CHAR_ENCODING =
-          "OriginalCharEncoding";
-  
-  public static final String CHAR_ENCODING_FOR_CONVERSION =
-          "CharEncodingForConversion";
+
+  public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";
+
+  public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion";
 
   public static final String SIGNATURE_KEY = "nutch.content.digest";
 
@@ -41,17 +38,22 @@ public interface Nutch {
 
   public static final String GENERATE_TIME_KEY = "_ngt_";
 
-  public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(GENERATE_TIME_KEY);
+  public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(
+      GENERATE_TIME_KEY);
 
   public static final String PROTO_STATUS_KEY = "_pst_";
 
-  public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(PROTO_STATUS_KEY);
-  
+  public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(
+      PROTO_STATUS_KEY);
+
   public static final String FETCH_TIME_KEY = "_ftk_";
-  
+
   public static final String FETCH_STATUS_KEY = "_fst_";
 
-  /** Sites may request that search engines don't provide access to cached documents. */
+  /**
+   * Sites may request that search engines don't provide access to cached
+   * documents.
+   */
   public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
 
   /** Show both original forbidden content and summaries (default). */
@@ -70,5 +72,6 @@ public interface Nutch {
   /** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
   public static final String FIXED_INTERVAL_KEY = "fixedInterval";
 
-  public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(FIXED_INTERVAL_KEY);
+  public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(
+      FIXED_INTERVAL_KEY);
 }

Modified: nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java Thu Jan 29 05:38:59 2015
@@ -33,7 +33,7 @@ public class SpellCheckedMetadata extend
 
   /**
    * Treshold divider.
-   *
+   * 
    * <code>threshold = searched.length() / TRESHOLD_DIVIDER;</code>
    */
   private static final int TRESHOLD_DIVIDER = 3;
@@ -52,7 +52,7 @@ public class SpellCheckedMetadata extend
 
     // Uses following array to fill the metanames index and the
     // metanames list.
-    Class<?>[] spellthese = {HttpHeaders.class};
+    Class<?>[] spellthese = { HttpHeaders.class };
 
     for (Class<?> spellCheckedNames : spellthese) {
       for (Field field : spellCheckedNames.getFields()) {
@@ -73,7 +73,7 @@ public class SpellCheckedMetadata extend
 
   /**
    * Normalizes String.
-   *
+   * 
    * @param str
    *          the string to normalize
    * @return normalized String
@@ -102,7 +102,7 @@ public class SpellCheckedMetadata extend
    * </ul>
    * If no matching with a well-known metadata name is found, then the original
    * name is returned.
-   *
+   * 
    * @param name
    *          Name to normalize
    * @return normalized name

Modified: nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java Thu Jan 29 05:38:59 2015
@@ -23,17 +23,18 @@ import org.apache.hadoop.conf.Configurab
 // Nutch imports
 import org.apache.nutch.plugin.Pluggable;
 
-
 /**
- * Interface used to limit which URLs enter Nutch.
- * Used by the injector and the db updater.
+ * Interface used to limit which URLs enter Nutch. Used by the injector and the
+ * db updater.
  */
 
 public interface URLFilter extends Pluggable, Configurable {
   /** The name of the extension point. */
   public final static String X_POINT_ID = URLFilter.class.getName();
 
-  /* Interface for a filter that transforms a URL: it can pass the
-     original URL through or "delete" the URL by returning null */
+  /*
+   * Interface for a filter that transforms a URL: it can pass the original URL
+   * through or "delete" the URL by returning null
+   */
   public String filter(String urlString);
 }

Modified: nutch/trunk/src/java/org/apache/nutch/net/URLFilterChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLFilterChecker.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLFilterChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLFilterChecker.java Thu Jan 29 05:38:59 2015
@@ -38,23 +38,23 @@ public class URLFilterChecker {
   private Configuration conf;
 
   public URLFilterChecker(Configuration conf) {
-      this.conf = conf;
+    this.conf = conf;
   }
 
   private void checkOne(String filterName) throws Exception {
     URLFilter filter = null;
 
-    ExtensionPoint point =
-      PluginRepository.get(conf).getExtensionPoint(URLFilter.X_POINT_ID);
+    ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+        URLFilter.X_POINT_ID);
 
     if (point == null)
-      throw new RuntimeException(URLFilter.X_POINT_ID+" not found.");
+      throw new RuntimeException(URLFilter.X_POINT_ID + " not found.");
 
     Extension[] extensions = point.getExtensions();
 
     for (int i = 0; i < extensions.length; i++) {
       Extension extension = extensions[i];
-      filter = (URLFilter)extension.getExtensionInstance();
+      filter = (URLFilter) extension.getExtensionInstance();
       if (filter.getClass().getName().equals(filterName)) {
         break;
       } else {
@@ -63,19 +63,19 @@ public class URLFilterChecker {
     }
 
     if (filter == null)
-      throw new RuntimeException("Filter "+filterName+" not found.");
+      throw new RuntimeException("Filter " + filterName + " not found.");
 
     // jerome : should we keep this behavior?
-    //if (LogFormatter.hasLoggedSevere())
-    //  throw new RuntimeException("Severe error encountered.");
+    // if (LogFormatter.hasLoggedSevere())
+    // throw new RuntimeException("Severe error encountered.");
 
-    System.out.println("Checking URLFilter "+filterName);
+    System.out.println("Checking URLFilter " + filterName);
 
     BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
     String line;
-    while((line=in.readLine())!=null) {
-      String out=filter.filter(line);
-      if(out!=null) {
+    while ((line = in.readLine()) != null) {
+      String out = filter.filter(line);
+      if (out != null) {
         System.out.print("+");
         System.out.println(out);
       } else {
@@ -90,10 +90,10 @@ public class URLFilterChecker {
 
     BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
     String line;
-    while((line=in.readLine())!=null) {
+    while ((line = in.readLine()) != null) {
       URLFilters filters = new URLFilters(this.conf);
       String out = filters.filter(line);
-      if(out!=null) {
+      if (out != null) {
         System.out.print("+");
         System.out.println(out);
       } else {
@@ -105,8 +105,8 @@ public class URLFilterChecker {
 
   public static void main(String[] args) throws Exception {
 
-    String usage = "Usage: URLFilterChecker (-filterName filterName | -allCombined) \n" 
-	+ "Tool takes a list of URLs, one per line, passed via STDIN.\n";
+    String usage = "Usage: URLFilterChecker (-filterName filterName | -allCombined) \n"
+        + "Tool takes a list of URLs, one per line, passed via STDIN.\n";
 
     if (args.length == 0) {
       System.err.println(usage);

Modified: nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java Thu Jan 29 05:38:59 2015
@@ -20,16 +20,15 @@ package org.apache.nutch.net;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.plugin.PluginRepository;
 
-/** Creates and caches {@link URLFilter} implementing plugins.*/
+/** Creates and caches {@link URLFilter} implementing plugins. */
 public class URLFilters {
 
   public static final String URLFILTER_ORDER = "urlfilter.order";
   private URLFilter[] filters;
 
   public URLFilters(Configuration conf) {
-    this.filters = (URLFilter[]) PluginRepository.get(conf)
-        .getOrderedPlugins(URLFilter.class, URLFilter.X_POINT_ID,
-            URLFILTER_ORDER);
+    this.filters = (URLFilter[]) PluginRepository.get(conf).getOrderedPlugins(
+        URLFilter.class, URLFilter.X_POINT_ID, URLFILTER_ORDER);
   }
 
   /** Run all defined filters. Assume logical AND. */

Modified: nutch/trunk/src/java/org/apache/nutch/net/URLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLNormalizer.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLNormalizer.java Thu Jan 29 05:38:59 2015
@@ -21,13 +21,17 @@ import java.net.MalformedURLException;
 
 import org.apache.hadoop.conf.Configurable;
 
-/** Interface used to convert URLs to normal form and optionally perform substitutions */
+/**
+ * Interface used to convert URLs to normal form and optionally perform
+ * substitutions
+ */
 public interface URLNormalizer extends Configurable {
-  
+
   /* Extension ID */
   public static final String X_POINT_ID = URLNormalizer.class.getName();
-  
+
   /* Interface for URL normalization */
-  public String normalize(String urlString, String scope) throws MalformedURLException;
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException;
 
 }

Modified: nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java Thu Jan 29 05:38:59 2015
@@ -36,23 +36,23 @@ public class URLNormalizerChecker {
   private Configuration conf;
 
   public URLNormalizerChecker(Configuration conf) {
-      this.conf = conf;
+    this.conf = conf;
   }
 
   private void checkOne(String normalizerName, String scope) throws Exception {
     URLNormalizer normalizer = null;
 
-    ExtensionPoint point =
-      PluginRepository.get(conf).getExtensionPoint(URLNormalizer.X_POINT_ID);
+    ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+        URLNormalizer.X_POINT_ID);
 
     if (point == null)
-      throw new RuntimeException(URLNormalizer.X_POINT_ID+" not found.");
+      throw new RuntimeException(URLNormalizer.X_POINT_ID + " not found.");
 
     Extension[] extensions = point.getExtensions();
 
     for (int i = 0; i < extensions.length; i++) {
       Extension extension = extensions[i];
-      normalizer = (URLNormalizer)extension.getExtensionInstance();
+      normalizer = (URLNormalizer) extension.getExtensionInstance();
       if (normalizer.getClass().getName().equals(normalizerName)) {
         break;
       } else {
@@ -61,7 +61,8 @@ public class URLNormalizerChecker {
     }
 
     if (normalizer == null)
-      throw new RuntimeException("URLNormalizer "+normalizerName+" not found.");
+      throw new RuntimeException("URLNormalizer " + normalizerName
+          + " not found.");
 
     System.out.println("Checking URLNormalizer " + normalizerName);
 
@@ -79,7 +80,7 @@ public class URLNormalizerChecker {
     BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
     String line;
     URLNormalizers normalizers = new URLNormalizers(conf, scope);
-    while((line = in.readLine()) != null) {
+    while ((line = in.readLine()) != null) {
       String out = normalizers.normalize(line, scope);
       System.out.println(out);
     }
@@ -88,7 +89,7 @@ public class URLNormalizerChecker {
   public static void main(String[] args) throws Exception {
 
     String usage = "Usage: URLNormalizerChecker [-normalizer <normalizerName>] [-scope <scope>]"
-      + "\n\tscope can be one of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink";
+        + "\n\tscope can be one of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink";
 
     String normalizerName = null;
     String scope = URLNormalizers.SCOPE_DEFAULT;
@@ -103,7 +104,8 @@ public class URLNormalizerChecker {
       }
     }
 
-    URLNormalizerChecker checker = new URLNormalizerChecker(NutchConfiguration.create());
+    URLNormalizerChecker checker = new URLNormalizerChecker(
+        NutchConfiguration.create());
     if (normalizerName != null) {
       checker.checkOne(normalizerName, scope);
     } else {

Modified: nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java Thu Jan 29 05:38:59 2015
@@ -43,47 +43,63 @@ import org.apache.nutch.util.ObjectCache
  * contexts where they are used (note however that they need to be activated
  * first through <tt>plugin.include</tt> property).
  * 
- * <p>There is one global scope defined by default, which consists of all
- * active normalizers. The order in which these normalizers
- * are executed may be defined in "urlnormalizer.order" property, which lists
- * space-separated implementation classes (if this property is missing normalizers
- * will be run in random order). If there are more
- * normalizers activated than explicitly named on this list, the remaining ones
- * will be run in random order after the ones specified on the list are executed.</p>
- * <p>You can define a set of contexts (or scopes) in which normalizers may be
+ * <p>
+ * There is one global scope defined by default, which consists of all active
+ * normalizers. The order in which these normalizers are executed may be defined
+ * in "urlnormalizer.order" property, which lists space-separated implementation
+ * classes (if this property is missing normalizers will be run in random
+ * order). If there are more normalizers activated than explicitly named on this
+ * list, the remaining ones will be run in random order after the ones specified
+ * on the list are executed.
+ * </p>
+ * <p>
+ * You can define a set of contexts (or scopes) in which normalizers may be
  * called. Each scope can have its own list of normalizers (defined in
  * "urlnormalizer.scope.<scope_name>" property) and its own order (defined in
  * "urlnormalizer.order.<scope_name>" property). If any of these properties are
- * missing, default settings are used for the global scope.</p>
- * <p>In case no normalizers are required for any given scope, a
- * <code>org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer</code> should be used.</p>
- * <p>Each normalizer may further select among many configurations, depending on
- * the scope in which it is called, because the scope name is passed as a parameter
- * to each normalizer. You can also use the same normalizer for many scopes.</p>
- * <p>Several scopes have been defined, and various Nutch tools will attempt using
- * scope-specific normalizers first (and fall back to default config if scope-specific
- * configuration is missing).</p>
- * <p>Normalizers may be run several times, to ensure that modifications introduced
+ * missing, default settings are used for the global scope.
+ * </p>
+ * <p>
+ * In case no normalizers are required for any given scope, a
+ * <code>org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer</code> should
+ * be used.
+ * </p>
+ * <p>
+ * Each normalizer may further select among many configurations, depending on
+ * the scope in which it is called, because the scope name is passed as a
+ * parameter to each normalizer. You can also use the same normalizer for many
+ * scopes.
+ * </p>
+ * <p>
+ * Several scopes have been defined, and various Nutch tools will attempt using
+ * scope-specific normalizers first (and fall back to default config if
+ * scope-specific configuration is missing).
+ * </p>
+ * <p>
+ * Normalizers may be run several times, to ensure that modifications introduced
  * by normalizers at the end of the list can be further reduced by normalizers
- * executed at the beginning. By default this loop is executed just once - if you want
- * to ensure that all possible combinations have been applied you may want to run
- * this loop up to the number of activated normalizers. This loop count can be configured
- * through <tt>urlnormalizer.loop.count</tt> property. As soon as the url is
- * unchanged the loop will stop and return the result.</p>
+ * executed at the beginning. By default this loop is executed just once - if
+ * you want to ensure that all possible combinations have been applied you may
+ * want to run this loop up to the number of activated normalizers. This loop
+ * count can be configured through <tt>urlnormalizer.loop.count</tt> property.
+ * As soon as the url is unchanged the loop will stop and return the result.
+ * </p>
  * 
  * @author Andrzej Bialecki
  */
 public final class URLNormalizers {
-  
-  /** Default scope. If no scope properties are defined then the configuration for
-   * this scope will be used.
+
+  /**
+   * Default scope. If no scope properties are defined then the configuration
+   * for this scope will be used.
    */
   public static final String SCOPE_DEFAULT = "default";
   /** Scope used by {@link org.apache.nutch.crawl.URLPartitioner}. */
   public static final String SCOPE_PARTITION = "partition";
   /** Scope used by {@link org.apache.nutch.crawl.Generator}. */
   public static final String SCOPE_GENERATE_HOST_COUNT = "generate_host_count";
-  /** Scope used by {@link org.apache.nutch.fetcher.Fetcher} when processing
+  /**
+   * Scope used by {@link org.apache.nutch.fetcher.Fetcher} when processing
    * redirect URLs.
    */
   public static final String SCOPE_FETCHER = "fetcher";
@@ -93,16 +109,21 @@ public final class URLNormalizers {
   public static final String SCOPE_LINKDB = "linkdb";
   /** Scope used by {@link org.apache.nutch.crawl.Injector}. */
   public static final String SCOPE_INJECT = "inject";
-  /** Scope used when constructing new {@link org.apache.nutch.parse.Outlink} instances. */
+  /**
+   * Scope used when constructing new {@link org.apache.nutch.parse.Outlink}
+   * instances.
+   */
   public static final String SCOPE_OUTLINK = "outlink";
   /** Scope used when indexing URLs. */
   public static final String SCOPE_INDEXER = "indexer";
 
-  public static final Logger LOG = LoggerFactory.getLogger(URLNormalizers.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(URLNormalizers.class);
 
   /* Empty extension list for caching purposes. */
-  private final List<Extension> EMPTY_EXTENSION_LIST = Collections.<Extension>emptyList();
-  
+  private final List<Extension> EMPTY_EXTENSION_LIST = Collections
+      .<Extension> emptyList();
+
   private final URLNormalizer[] EMPTY_NORMALIZERS = new URLNormalizer[0];
 
   private Configuration conf;
@@ -110,37 +131,39 @@ public final class URLNormalizers {
   private ExtensionPoint extensionPoint;
 
   private URLNormalizer[] normalizers;
-  
+
   private int loopCount;
 
   public URLNormalizers(Configuration conf, String scope) {
     this.conf = conf;
     this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
-            URLNormalizer.X_POINT_ID);
+        URLNormalizer.X_POINT_ID);
     ObjectCache objectCache = ObjectCache.get(conf);
-    
+
     if (this.extensionPoint == null) {
       throw new RuntimeException("x point " + URLNormalizer.X_POINT_ID
-              + " not found.");
+          + " not found.");
     }
 
-    normalizers = (URLNormalizer[])objectCache.getObject(URLNormalizer.X_POINT_ID + "_" + scope);
+    normalizers = (URLNormalizer[]) objectCache
+        .getObject(URLNormalizer.X_POINT_ID + "_" + scope);
     if (normalizers == null) {
       normalizers = getURLNormalizers(scope);
     }
     if (normalizers == EMPTY_NORMALIZERS) {
-      normalizers = (URLNormalizer[])objectCache.getObject(URLNormalizer.X_POINT_ID + "_" + SCOPE_DEFAULT);
+      normalizers = (URLNormalizer[]) objectCache
+          .getObject(URLNormalizer.X_POINT_ID + "_" + SCOPE_DEFAULT);
       if (normalizers == null) {
         normalizers = getURLNormalizers(SCOPE_DEFAULT);
       }
     }
-    
+
     loopCount = conf.getInt("urlnormalizer.loop.count", 1);
   }
 
   /**
-   * Function returns an array of {@link URLNormalizer}s for a given scope,
-   * with a specified order.
+   * Function returns an array of {@link URLNormalizer}s for a given scope, with
+   * a specified order.
    * 
    * @param scope
    *          The scope to return the <code>Array</code> of
@@ -152,12 +175,13 @@ public final class URLNormalizers {
   URLNormalizer[] getURLNormalizers(String scope) {
     List<Extension> extensions = getExtensions(scope);
     ObjectCache objectCache = ObjectCache.get(conf);
-    
+
     if (extensions == EMPTY_EXTENSION_LIST) {
       return EMPTY_NORMALIZERS;
     }
-    
-    List<URLNormalizer> normalizers = new Vector<URLNormalizer>(extensions.size());
+
+    List<URLNormalizer> normalizers = new Vector<URLNormalizer>(
+        extensions.size());
 
     Iterator<Extension> it = extensions.iterator();
     while (it.hasNext()) {
@@ -175,14 +199,13 @@ public final class URLNormalizers {
       } catch (PluginRuntimeException e) {
         e.printStackTrace();
         LOG.warn("URLNormalizers:PluginRuntimeException when "
-                + "initializing url normalizer plugin "
-                + ext.getDescriptor().getPluginId()
-                + " instance in getURLNormalizers "
-                + "function: attempting to continue instantiating plugins");
+            + "initializing url normalizer plugin "
+            + ext.getDescriptor().getPluginId()
+            + " instance in getURLNormalizers "
+            + "function: attempting to continue instantiating plugins");
       }
     }
-    return normalizers.toArray(new URLNormalizer[normalizers
-            .size()]);
+    return normalizers.toArray(new URLNormalizer[normalizers.size()]);
   }
 
   /**
@@ -197,9 +220,8 @@ public final class URLNormalizers {
   @SuppressWarnings("unchecked")
   private List<Extension> getExtensions(String scope) {
     ObjectCache objectCache = ObjectCache.get(conf);
-    List<Extension> extensions = 
-      (List<Extension>) objectCache.getObject(URLNormalizer.X_POINT_ID + "_x_"
-                                                + scope);
+    List<Extension> extensions = (List<Extension>) objectCache
+        .getObject(URLNormalizer.X_POINT_ID + "_x_" + scope);
 
     // Just compare the reference:
     // if this is the empty list, we know we will find no extension.
@@ -210,11 +232,13 @@ public final class URLNormalizers {
     if (extensions == null) {
       extensions = findExtensions(scope);
       if (extensions != null) {
-        objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, extensions);
+        objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope,
+            extensions);
       } else {
         // Put the empty extension list into cache
         // to remember we don't know any related extension.
-        objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, EMPTY_EXTENSION_LIST);
+        objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope,
+            EMPTY_EXTENSION_LIST);
         extensions = EMPTY_EXTENSION_LIST;
       }
     }
@@ -234,7 +258,8 @@ public final class URLNormalizers {
 
     String[] orders = null;
     String orderlist = conf.get("urlnormalizer.order." + scope);
-    if (orderlist == null) orderlist = conf.get("urlnormalizer.order");
+    if (orderlist == null)
+      orderlist = conf.get("urlnormalizer.order");
     if (orderlist != null && !orderlist.trim().equals("")) {
       orders = orderlist.trim().split("\\s+");
     }
@@ -272,13 +297,17 @@ public final class URLNormalizers {
 
   /**
    * Normalize
-   * @param urlString The URL string to normalize.
-   * @param scope The given scope.
+   * 
+   * @param urlString
+   *          The URL string to normalize.
+   * @param scope
+   *          The given scope.
    * @return A normalized String, using the given <code>scope</code>
-   * @throws MalformedURLException If the given URL string is malformed.
+   * @throws MalformedURLException
+   *           If the given URL string is malformed.
    */
   public String normalize(String urlString, String scope)
-          throws MalformedURLException {
+      throws MalformedURLException {
     // optionally loop several times, and break if no further changes
     String initialString = urlString;
     for (int k = 0; k < loopCount; k++) {
@@ -287,7 +316,8 @@ public final class URLNormalizers {
           return null;
         urlString = this.normalizers[i].normalize(urlString, scope);
       }
-      if (initialString.equals(urlString)) break;
+      if (initialString.equals(urlString))
+        break;
       initialString = urlString;
     }
     return urlString;

Modified: nutch/trunk/src/java/org/apache/nutch/net/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/package-info.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * and {@link org.apache.nutch.net.URLNormalizer normalizers}.
  */
 package org.apache.nutch.net;
+

Modified: nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java Thu Jan 29 05:38:59 2015
@@ -26,15 +26,15 @@ import java.text.ParseException;
 
 /**
  * class to handle HTTP dates.
- *
+ * 
  * Modified from FastHttpDateFormat.java in jakarta-tomcat.
- *
+ * 
  * @author John Xing
  */
 public class HttpDateFormat {
 
-  protected static SimpleDateFormat format = 
-    new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);
+  protected static SimpleDateFormat format = new SimpleDateFormat(
+      "EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);
 
   /**
    * HTTP date uses TimeZone GMT
@@ -43,29 +43,29 @@ public class HttpDateFormat {
     format.setTimeZone(TimeZone.getTimeZone("GMT"));
   }
 
-  //HttpDate (long t) {
-  //}
+  // HttpDate (long t) {
+  // }
 
-  //HttpDate (String s) {
-  //}
+  // HttpDate (String s) {
+  // }
 
-//  /**
-//   * Get the current date in HTTP format.
-//   */
-//  public static String getCurrentDate() {
-//
-//    long now = System.currentTimeMillis();
-//    if ((now - currentDateGenerated) > 1000) {
-//        synchronized (format) {
-//            if ((now - currentDateGenerated) > 1000) {
-//                currentDateGenerated = now;
-//                currentDate = format.format(new Date(now));
-//            }
-//        }
-//    }
-//    return currentDate;
-//
-//  }
+  // /**
+  // * Get the current date in HTTP format.
+  // */
+  // public static String getCurrentDate() {
+  //
+  // long now = System.currentTimeMillis();
+  // if ((now - currentDateGenerated) > 1000) {
+  // synchronized (format) {
+  // if ((now - currentDateGenerated) > 1000) {
+  // currentDateGenerated = now;
+  // currentDate = format.format(new Date(now));
+  // }
+  // }
+  // }
+  // return currentDate;
+  //
+  // }
 
   /**
    * Get the HTTP format of the specified date.

Modified: nutch/trunk/src/java/org/apache/nutch/net/protocols/ProtocolException.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/protocols/ProtocolException.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/protocols/ProtocolException.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/protocols/ProtocolException.java Thu Jan 29 05:38:59 2015
@@ -21,13 +21,13 @@ import java.io.Serializable;
 
 /**
  * Base exception for all protocol handlers
+ * 
  * @deprecated Use {@link org.apache.nutch.protocol.ProtocolException} instead.
  */
 @Deprecated
 @SuppressWarnings("serial")
 public class ProtocolException extends Exception implements Serializable {
 
-
   public ProtocolException() {
     super();
   }

Modified: nutch/trunk/src/java/org/apache/nutch/net/protocols/Response.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/protocols/Response.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/protocols/Response.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/protocols/Response.java Thu Jan 29 05:38:59 2015
@@ -23,12 +23,11 @@ import java.net.URL;
 import org.apache.nutch.metadata.HttpHeaders;
 import org.apache.nutch.metadata.Metadata;
 
-
 /**
- * A response interface.  Makes all protocols model HTTP.
+ * A response interface. Makes all protocols model HTTP.
  */
 public interface Response extends HttpHeaders {
-  
+
   /** Returns the URL used to retrieve this response. */
   public URL getUrl();
 
@@ -40,7 +39,7 @@ public interface Response extends HttpHe
 
   /** Returns all the headers. */
   public Metadata getHeaders();
-  
+
   /** Returns the full content of the response. */
   public byte[] getContent();
 

Modified: nutch/trunk/src/java/org/apache/nutch/net/protocols/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/protocols/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/protocols/package-info.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/protocols/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * interface, see also {@link org.apache.nutch.protocol}.
  */
 package org.apache.nutch.net.protocols;
+

Modified: nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java Thu Jan 29 05:38:59 2015
@@ -24,8 +24,8 @@ import java.util.Properties;
 import org.apache.nutch.metadata.Metadata;
 
 /**
- * This class holds the information about HTML "meta" tags extracted from 
- * a page. Some special tags have convenience methods for easy checking.
+ * This class holds the information about HTML "meta" tags extracted from a
+ * page. Some special tags have convenience methods for easy checking.
  */
 public class HTMLMetaTags {
   private boolean noIndex = false;
@@ -45,7 +45,7 @@ public class HTMLMetaTags {
   private Metadata generalTags = new Metadata();
 
   private Properties httpEquivTags = new Properties();
-  
+
   /**
    * Sets all boolean values to <code>false</code>. Clears all other tags.
    */
@@ -156,8 +156,8 @@ public class HTMLMetaTags {
   }
 
   /**
-   * A convenience method. Returns the current value of <code>refreshTime</code>.
-   * The value may be invalid if {@link #getRefresh()}returns
+   * Convenience method returning the current <code>refreshTime</code> value.
+   * The value may be invalid if {@link #getRefresh()} returns
    * <code>false</code>.
    */
   public int getRefreshTime() {
@@ -179,16 +179,12 @@ public class HTMLMetaTags {
   public Properties getHttpEquivTags() {
     return httpEquivTags;
   }
-  
+
   public String toString() {
     StringBuffer sb = new StringBuffer();
-    sb.append("base=" + baseHref
-            + ", noCache=" + noCache
-            + ", noFollow=" + noFollow
-            + ", noIndex=" + noIndex
-            + ", refresh=" + refresh
-            + ", refreshHref=" + refreshHref + "\n"
-            );
+    sb.append("base=" + baseHref + ", noCache=" + noCache + ", noFollow="
+        + noFollow + ", noIndex=" + noIndex + ", refresh=" + refresh
+        + ", refreshHref=" + refreshHref + "\n");
     sb.append(" * general tags:\n");
     String[] names = generalTags.names();
     for (String name : names) {
@@ -199,7 +195,7 @@ public class HTMLMetaTags {
     Iterator<Object> it = httpEquivTags.keySet().iterator();
     it = httpEquivTags.keySet().iterator();
     while (it.hasNext()) {
-      String key = (String)it.next();
+      String key = (String) it.next();
       sb.append("   - " + key + "\t=\t" + httpEquivTags.get(key) + "\n");
     }
     return sb.toString();

Modified: nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java Thu Jan 29 05:38:59 2015
@@ -27,16 +27,19 @@ import org.apache.hadoop.conf.Configurab
 import org.apache.nutch.plugin.Pluggable;
 import org.apache.nutch.protocol.Content;
 
-
-/** Extension point for DOM-based HTML parsers.  Permits one to add additional
- * metadata to HTML parses.  All plugins found which implement this extension
+/**
+ * Extension point for DOM-based HTML parsers. Permits one to add additional
+ * metadata to HTML parses. All plugins found which implement this extension
  * point are run sequentially on the parse.
  */
 public interface HtmlParseFilter extends Pluggable, Configurable {
   /** The name of the extension point. */
   final static String X_POINT_ID = HtmlParseFilter.class.getName();
 
-  /** Adds metadata or otherwise modifies a parse of HTML content, given
-   * the DOM tree of a page. */
-  ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc);
+  /**
+   * Adds metadata or otherwise modifies a parse of HTML content, given the DOM
+   * tree of a page.
+   */
+  ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc);
 }

Modified: nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Thu Jan 29 05:38:59 2015
@@ -23,11 +23,11 @@ import org.apache.hadoop.conf.Configurat
 
 import org.w3c.dom.DocumentFragment;
 
-/** Creates and caches {@link HtmlParseFilter} implementing plugins.*/
+/** Creates and caches {@link HtmlParseFilter} implementing plugins. */
 public class HtmlParseFilters {
 
   private HtmlParseFilter[] htmlParseFilters;
-  
+
   public static final String HTMLPARSEFILTER_ORDER = "htmlparsefilter.order";
 
   public HtmlParseFilters(Configuration conf) {
@@ -37,13 +37,14 @@ public class HtmlParseFilters {
   }
 
   /** Run all defined filters. */
-  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
 
     // loop on each filter
-    for (int i = 0 ; i < this.htmlParseFilters.length; i++) {
+    for (int i = 0; i < this.htmlParseFilters.length; i++) {
       // call filter interface
-      parseResult =
-        htmlParseFilters[i].filter(content, parseResult, metaTags, doc);
+      parseResult = htmlParseFilters[i].filter(content, parseResult, metaTags,
+          doc);
 
       // any failure on parse obj, return
       if (!parseResult.isSuccess()) {

Modified: nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java Thu Jan 29 05:38:59 2015
@@ -30,103 +30,102 @@ import org.apache.hadoop.io.Writable;
 /* An outgoing link from a page. */
 public class Outlink implements Writable {
 
-    private String toUrl;
-    private String anchor;
-    private MapWritable md;
-
-    public Outlink() {
-    }
-
-    public Outlink(String toUrl, String anchor) throws MalformedURLException {
-        this.toUrl = toUrl;
-        if (anchor == null)
-            anchor = "";
-        this.anchor = anchor;
-        md = null;
-    }
-
-    public void readFields(DataInput in) throws IOException {
-        toUrl = Text.readString(in);
-        anchor = Text.readString(in);
-        boolean hasMD = in.readBoolean();
-        if (hasMD) {
-            md = new org.apache.hadoop.io.MapWritable();
-            md.readFields(in);
-        } else
-            md = null;
-    }
-
-    /** Skips over one Outlink in the input. */
-    public static void skip(DataInput in) throws IOException {
-        Text.skip(in); // skip toUrl
-        Text.skip(in); // skip anchor
-        boolean hasMD = in.readBoolean();
-        if (hasMD) {
-            MapWritable metadata = new org.apache.hadoop.io.MapWritable();
-            metadata.readFields(in);
-            ;
-        }
-    }
-
-    public void write(DataOutput out) throws IOException {
-        Text.writeString(out, toUrl);
-        Text.writeString(out, anchor);
-        if (md != null && md.size() > 0) {
-            out.writeBoolean(true);
-            md.write(out);
-        } else {
-            out.writeBoolean(false);
-        }
-    }
-
-    public static Outlink read(DataInput in) throws IOException {
-        Outlink outlink = new Outlink();
-        outlink.readFields(in);
-        return outlink;
-    }
-
-    public String getToUrl() {
-        return toUrl;
-    }
-
-    public void setUrl(String toUrl) {
-        this.toUrl = toUrl;
-    }
-
-    public String getAnchor() {
-        return anchor;
-    }
-
-    public MapWritable getMetadata() {
-        return md;
-    }
-
-    public void setMetadata(MapWritable md) {
-        this.md = md;
-    }
-
-    public boolean equals(Object o) {
-        if (!(o instanceof Outlink))
-            return false;
-        Outlink other = (Outlink) o;
-        return this.toUrl.equals(other.toUrl)
-                && this.anchor.equals(other.anchor);
-    }
-
-    public String toString() {
-        StringBuffer repr = new StringBuffer("toUrl: ");
-        repr.append(toUrl);
-        repr.append(" anchor: ");
-        repr.append(anchor);
-        if (md != null && !md.isEmpty()) {
-            for (Entry<Writable, Writable> e : md.entrySet()) {
-                repr.append(" ");
-                repr.append(e.getKey());
-                repr.append(": ");
-                repr.append(e.getValue());
-            }
-        }
-        return repr.toString();
+  private String toUrl;
+  private String anchor;
+  private MapWritable md;
+
+  public Outlink() {
+  }
+
+  public Outlink(String toUrl, String anchor) throws MalformedURLException {
+    this.toUrl = toUrl;
+    if (anchor == null)
+      anchor = "";
+    this.anchor = anchor;
+    md = null;
+  }
+
+  public void readFields(DataInput in) throws IOException {
+    toUrl = Text.readString(in);
+    anchor = Text.readString(in);
+    boolean hasMD = in.readBoolean();
+    if (hasMD) {
+      md = new org.apache.hadoop.io.MapWritable();
+      md.readFields(in);
+    } else
+      md = null;
+  }
+
+  /** Skips over one Outlink in the input. */
+  public static void skip(DataInput in) throws IOException {
+    Text.skip(in); // skip toUrl
+    Text.skip(in); // skip anchor
+    boolean hasMD = in.readBoolean();
+    if (hasMD) {
+      MapWritable metadata = new org.apache.hadoop.io.MapWritable();
+      metadata.readFields(in);
+      ;
+    }
+  }
+
+  public void write(DataOutput out) throws IOException {
+    Text.writeString(out, toUrl);
+    Text.writeString(out, anchor);
+    if (md != null && md.size() > 0) {
+      out.writeBoolean(true);
+      md.write(out);
+    } else {
+      out.writeBoolean(false);
+    }
+  }
+
+  public static Outlink read(DataInput in) throws IOException {
+    Outlink outlink = new Outlink();
+    outlink.readFields(in);
+    return outlink;
+  }
+
+  public String getToUrl() {
+    return toUrl;
+  }
+
+  public void setUrl(String toUrl) {
+    this.toUrl = toUrl;
+  }
+
+  public String getAnchor() {
+    return anchor;
+  }
+
+  public MapWritable getMetadata() {
+    return md;
+  }
+
+  public void setMetadata(MapWritable md) {
+    this.md = md;
+  }
+
+  public boolean equals(Object o) {
+    if (!(o instanceof Outlink))
+      return false;
+    Outlink other = (Outlink) o;
+    return this.toUrl.equals(other.toUrl) && this.anchor.equals(other.anchor);
+  }
+
+  public String toString() {
+    StringBuffer repr = new StringBuffer("toUrl: ");
+    repr.append(toUrl);
+    repr.append(" anchor: ");
+    repr.append(anchor);
+    if (md != null && !md.isEmpty()) {
+      for (Entry<Writable, Writable> e : md.entrySet()) {
+        repr.append(" ");
+        repr.append(e.getKey());
+        repr.append(": ");
+        repr.append(e.getValue());
+      }
     }
+    return repr.toString();
+  }
 
 }

Modified: nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Thu Jan 29 05:38:59 2015
@@ -34,8 +34,8 @@ import org.apache.oro.text.regex.Perl5Co
 import org.apache.oro.text.regex.Perl5Matcher;
 
 /**
- * Extractor to extract {@link org.apache.nutch.parse.Outlink}s 
- * / URLs from plain text using Regular Expressions.
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from
+ * plain text using Regular Expressions.
  * 
  * @see <a
  *      href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison
@@ -48,23 +48,26 @@ import org.apache.oro.text.regex.Perl5Ma
  * @since 0.7
  */
 public class OutlinkExtractor {
-  private static final Logger LOG = LoggerFactory.getLogger(OutlinkExtractor.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(OutlinkExtractor.class);
 
   /**
    * Regex pattern to get URLs within a plain text.
    * 
    * @see <a
    *      href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
+
    *      </a>
    */
-  private static final String URL_PATTERN = 
-    "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
+  private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
 
   /**
-   * Extracts <code>Outlink</code> from given plain text.
-   * Applying this method to non-plain-text can result in extremely lengthy
-   * runtimes for parasitic cases (postscript is a known example).
-   * @param plainText  the plain text from wich URLs should be extracted.
+   * Extracts <code>Outlink</code> from given plain text. Applying this method
+   * to non-plain-text can result in extremely lengthy runtimes for parasitic
+   * cases (postscript is a known example).
+   * 
+   * @param plainText
+   *          the plain text from wich URLs should be extracted.
    * 
    * @return Array of <code>Outlink</code>s within found in plainText
    */
@@ -73,15 +76,18 @@ public class OutlinkExtractor {
   }
 
   /**
-   * Extracts <code>Outlink</code> from given plain text and adds anchor
-   * to the extracted <code>Outlink</code>s
+   * Extracts <code>Outlink</code> from given plain text and adds anchor to the
+   * extracted <code>Outlink</code>s
    * 
-   * @param plainText the plain text from wich URLs should be extracted.
-   * @param anchor    the anchor of the url
+   * @param plainText
+   *          the plain text from wich URLs should be extracted.
+   * @param anchor
+   *          the anchor of the url
    * 
    * @return Array of <code>Outlink</code>s within found in plainText
    */
-  public static Outlink[] getOutlinks(final String plainText, String anchor, Configuration conf) {
+  public static Outlink[] getOutlinks(final String plainText, String anchor,
+      Configuration conf) {
     long start = System.currentTimeMillis();
     final List<Outlink> outlinks = new ArrayList<Outlink>();
 
@@ -97,11 +103,11 @@ public class OutlinkExtractor {
       MatchResult result;
       String url;
 
-      //loop the matches
+      // loop the matches
       while (matcher.contains(input, pattern)) {
         // if this is taking too long, stop matching
-        //   (SHOULD really check cpu time used so that heavily loaded systems
-        //   do not unnecessarily hit this limit.)
+        // (SHOULD really check cpu time used so that heavily loaded systems
+        // do not unnecessarily hit this limit.)
         if (System.currentTimeMillis() - start >= 60000L) {
           if (LOG.isWarnEnabled()) {
             LOG.warn("Time limit exceeded for getOutLinks");
@@ -117,13 +123,16 @@ public class OutlinkExtractor {
         }
       }
     } catch (Exception ex) {
-      // if the matcher fails (perhaps a malformed URL) we just log it and move on
-      if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
+      // if the matcher fails (perhaps a malformed URL) we just log it and move
+      // on
+      if (LOG.isErrorEnabled()) {
+        LOG.error("getOutlinks", ex);
+      }
     }
 
     final Outlink[] retval;
 
-    //create array of the Outlinks
+    // create array of the Outlinks
     if (outlinks != null && outlinks.size() > 0) {
       retval = outlinks.toArray(new Outlink[0]);
     } else {
@@ -132,7 +141,6 @@ public class OutlinkExtractor {
 
     return retval;
   }
-  
 
   /**
    * Extracts outlinks from a plain text. <br />
@@ -162,7 +170,7 @@ public class OutlinkExtractor {
     // url = re.getParen(0);
     //
     // if (LOG.isTraceEnabled()) {
-    //   LOG.trace("Extracted url: " + url);
+    // LOG.trace("Extracted url: " + url);
     // }
     //
     // try {
@@ -192,9 +200,8 @@ public class OutlinkExtractor {
   }
 
   /**
-   * Extracts outlinks from a plain text.
-   * </p>
-   * This Method takes the JDK5 Regexp API.
+   * Extracts outlinks from a plain text. </p> This Method takes the JDK5 Regexp
+   * API.
    * 
    * @param plainText
    * 
@@ -243,5 +250,5 @@ public class OutlinkExtractor {
     //
     // return retval;
   }
- 
+
 }

Modified: nutch/trunk/src/java/org/apache/nutch/parse/Parse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/Parse.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/Parse.java Thu Jan 29 05:38:59 2015
@@ -17,18 +17,22 @@
 
 package org.apache.nutch.parse;
 
-/** The result of parsing a page's raw content.
+/**
+ * The result of parsing a page's raw content.
+ * 
  * @see Parser#getParse(Content)
  */
 public interface Parse {
-  
-  /** The textual content of the page. This is indexed, searched, and used when
-   * generating snippets.*/ 
+
+  /**
+   * The textual content of the page. This is indexed, searched, and used when
+   * generating snippets.
+   */
   String getText();
 
   /** Other data extracted from the page. */
   ParseData getData();
-  
+
   /** Indicates if the parse is coming from a url or a sub-url */
   boolean isCanonical();
 }

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseCallable.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseCallable.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseCallable.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseCallable.java Thu Jan 29 05:38:59 2015
@@ -24,7 +24,7 @@ import org.apache.nutch.protocol.Content
 class ParseCallable implements Callable<ParseResult> {
   private Parser p;
   private Content content;
-  
+
   public ParseCallable(Parser p, Content content) {
     this.p = p;
     this.content = content;
@@ -33,5 +33,5 @@ class ParseCallable implements Callable<
   @Override
   public ParseResult call() throws Exception {
     return p.getParse(content);
-  }    
+  }
 }
\ No newline at end of file

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Thu Jan 29 05:38:59 2015
@@ -30,8 +30,9 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.util.NutchConfiguration;
 
-
-/** Data extracted from a page's content.
+/**
+ * Data extracted from a page's content.
+ * 
  * @see Parse#getData()
  */
 public final class ParseData extends VersionedWritable {
@@ -45,19 +46,19 @@ public final class ParseData extends Ver
   private Metadata parseMeta;
   private ParseStatus status;
   private byte version = VERSION;
-  
+
   public ParseData() {
     contentMeta = new Metadata();
     parseMeta = new Metadata();
   }
 
   public ParseData(ParseStatus status, String title, Outlink[] outlinks,
-                   Metadata contentMeta) {
+      Metadata contentMeta) {
     this(status, title, outlinks, contentMeta, new Metadata());
   }
-  
+
   public ParseData(ParseStatus status, String title, Outlink[] outlinks,
-                   Metadata contentMeta, Metadata parseMeta) {
+      Metadata contentMeta, Metadata parseMeta) {
     this.status = status;
     this.title = title;
     this.outlinks = outlinks;
@@ -70,25 +71,34 @@ public final class ParseData extends Ver
   //
 
   /** The status of parsing the page. */
-  public ParseStatus getStatus() { return status; }
-  
+  public ParseStatus getStatus() {
+    return status;
+  }
+
   /** The title of the page. */
-  public String getTitle() { return title; }
+  public String getTitle() {
+    return title;
+  }
 
   /** The outlinks of the page. */
-  public Outlink[] getOutlinks() { return outlinks; }
+  public Outlink[] getOutlinks() {
+    return outlinks;
+  }
 
   /** The original Metadata retrieved from content */
-  public Metadata getContentMeta() { return contentMeta; }
+  public Metadata getContentMeta() {
+    return contentMeta;
+  }
 
   /**
-   * Other content properties.
-   * This is the place to find format-specific properties.
-   * Different parser implementations for different content types will populate
-   * this differently.
+   * Other content properties. This is the place to find format-specific
+   * properties. Different parser implementations for different content types
+   * will populate this differently.
    */
-  public Metadata getParseMeta() { return parseMeta; }
-  
+  public Metadata getParseMeta() {
+    return parseMeta;
+  }
+
   public void setParseMeta(Metadata parseMeta) {
     this.parseMeta = parseMeta;
   }
@@ -96,11 +106,12 @@ public final class ParseData extends Ver
   public void setOutlinks(Outlink[] outlinks) {
     this.outlinks = outlinks;
   }
-  
+
   /**
-   * Get a metadata single value.
-   * This method first looks for the metadata value in the parse metadata. If no
-   * value is found it the looks for the metadata in the content metadata.
+   * Get a metadata single value. This method first looks for the metadata value
+   * in the parse metadata. If no value is found it the looks for the metadata
+   * in the content metadata.
+   * 
    * @see #getContentMeta()
    * @see #getParseMeta()
    */
@@ -111,12 +122,14 @@ public final class ParseData extends Ver
     }
     return value;
   }
-  
+
   //
   // Writable methods
   //
 
-  public byte getVersion() { return version; }
+  public byte getVersion() {
+    return version;
+  }
 
   public final void readFields(DataInput in) throws IOException {
 
@@ -125,16 +138,16 @@ public final class ParseData extends Ver
     if (version != VERSION)
       throw new VersionMismatchException(VERSION, version);
     status = ParseStatus.read(in);
-    title = Text.readString(in);                   // read title
+    title = Text.readString(in); // read title
 
-    int numOutlinks = in.readInt();    
+    int numOutlinks = in.readInt();
     outlinks = new Outlink[numOutlinks];
     for (int i = 0; i < numOutlinks; i++) {
       outlinks[i] = Outlink.read(in);
     }
-    
+
     if (version < 3) {
-      int propertyCount = in.readInt();             // read metadata
+      int propertyCount = in.readInt(); // read metadata
       contentMeta.clear();
       for (int i = 0; i < propertyCount; i++) {
         contentMeta.add(Text.readString(in), Text.readString(in));
@@ -150,15 +163,15 @@ public final class ParseData extends Ver
   }
 
   public final void write(DataOutput out) throws IOException {
-    out.writeByte(VERSION);                       // write version
-    status.write(out);                            // write status
-    Text.writeString(out, title);                 // write title
+    out.writeByte(VERSION); // write version
+    status.write(out); // write status
+    Text.writeString(out, title); // write title
 
-    out.writeInt(outlinks.length);                // write outlinks
+    out.writeInt(outlinks.length); // write outlinks
     for (int i = 0; i < outlinks.length; i++) {
       outlinks[i].write(out);
     }
-    contentMeta.write(out);                      // write content metadata
+    contentMeta.write(out); // write content metadata
     parseMeta.write(out);
   }
 
@@ -175,38 +188,36 @@ public final class ParseData extends Ver
   public boolean equals(Object o) {
     if (!(o instanceof ParseData))
       return false;
-    ParseData other = (ParseData)o;
-    return
-      this.status.equals(other.status) &&
-      this.title.equals(other.title) &&
-      Arrays.equals(this.outlinks, other.outlinks) &&
-      this.contentMeta.equals(other.contentMeta) &&
-      this.parseMeta.equals(other.parseMeta);
+    ParseData other = (ParseData) o;
+    return this.status.equals(other.status) && this.title.equals(other.title)
+        && Arrays.equals(this.outlinks, other.outlinks)
+        && this.contentMeta.equals(other.contentMeta)
+        && this.parseMeta.equals(other.parseMeta);
   }
 
   public String toString() {
     StringBuffer buffer = new StringBuffer();
 
-    buffer.append("Version: " + version + "\n" );
-    buffer.append("Status: " + status + "\n" );
-    buffer.append("Title: " + title + "\n" );
+    buffer.append("Version: " + version + "\n");
+    buffer.append("Status: " + status + "\n");
+    buffer.append("Title: " + title + "\n");
 
     if (outlinks != null) {
-      buffer.append("Outlinks: " + outlinks.length + "\n" );
+      buffer.append("Outlinks: " + outlinks.length + "\n");
       for (int i = 0; i < outlinks.length; i++) {
         buffer.append("  outlink: " + outlinks[i] + "\n");
       }
     }
 
-    buffer.append("Content Metadata: " + contentMeta + "\n" );
-    buffer.append("Parse Metadata: " + parseMeta + "\n" );
+    buffer.append("Content Metadata: " + contentMeta + "\n");
+    buffer.append("Parse Metadata: " + parseMeta + "\n");
 
     return buffer.toString();
   }
 
   public static void main(String argv[]) throws Exception {
     String usage = "ParseData (-local | -dfs <namenode:port>) recno segment";
-    
+
     if (argv.length < 3) {
       System.out.println("usage:" + usage);
       return;
@@ -214,13 +225,12 @@ public final class ParseData extends Ver
 
     Options opts = new Options();
     Configuration conf = NutchConfiguration.create();
-    
-    GenericOptionsParser parser =
-      new GenericOptionsParser(conf, opts, argv);
-    
+
+    GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);
+
     String[] remainingArgs = parser.getRemainingArgs();
     FileSystem fs = FileSystem.get(conf);
-    
+
     try {
       int recno = Integer.parseInt(remainingArgs[0]);
       String segment = remainingArgs[1];

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java Thu Jan 29 05:38:59 2015
@@ -20,8 +20,9 @@ package org.apache.nutch.parse;
 import java.io.*;
 import org.apache.hadoop.io.*;
 
-
-/** The result of parsing a page's raw content.
+/**
+ * The result of parsing a page's raw content.
+ * 
  * @see Parser#getParse(Content)
  */
 public class ParseImpl implements Parse, Writable {
@@ -29,7 +30,8 @@ public class ParseImpl implements Parse,
   private ParseData data;
   private boolean isCanonical;
 
-  public ParseImpl() {}
+  public ParseImpl() {
+  }
 
   public ParseImpl(Parse parse) {
     this(new ParseText(parse.getText()), parse.getData(), true);
@@ -38,7 +40,7 @@ public class ParseImpl implements Parse,
   public ParseImpl(String text, ParseData data) {
     this(new ParseText(text), data, true);
   }
-  
+
   public ParseImpl(ParseText text, ParseData data) {
     this(text, data, true);
   }
@@ -49,12 +51,18 @@ public class ParseImpl implements Parse,
     this.isCanonical = isCanonical;
   }
 
-  public String getText() { return text.getText(); }
+  public String getText() {
+    return text.getText();
+  }
 
-  public ParseData getData() { return data; }
+  public ParseData getData() {
+    return data;
+  }
+
+  public boolean isCanonical() {
+    return isCanonical;
+  }
 
-  public boolean isCanonical() { return isCanonical; }
-  
   public final void write(DataOutput out) throws IOException {
     out.writeBoolean(isCanonical);
     text.write(out);