You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/29 06:39:03 UTC
svn commit: r1655526 [13/26] - in /nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/p...

Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java Thu Jan 29 05:38:59 2015
@@ -18,29 +18,29 @@
 package org.apache.nutch.util.domain;
 
 /**
- * This class represents the last part of the host name, 
- * which is operated by authoritives, not individuals. This information 
- * is needed to find the domain name of a host. The domain name of a host
- * is defined to be the last part before the domain suffix, w/o subdomain 
- * names.  As an example the domain name of <br><code> http://lucene.apache.org/ 
- * </code><br> is <code> apache.org</code>   
- * <br>
- * This class holds three fields,  
- * <strong>domain</strong> field represents the suffix (such as "co.uk")
- * <strong>boost</strong> is a float for boosting score of url's with this suffix
- * <strong>status</strong> field represents domain's status
+ * This class represents the last part of the host name, which is operated by
+ * authorities, not individuals. This information is needed to find the domain
+ * name of a host. The domain name of a host is defined to be the last part
+ * before the domain suffix, w/o subdomain names. As an example the domain name
+ * of <br>
+ * <code> http://lucene.apache.org/ 
+ * </code><br>
+ * is <code> apache.org</code> <br>
+ * This class holds three fields: the <strong>domain</strong> field represents
+ * the suffix (such as "co.uk"); <strong>boost</strong> is a float for boosting
+ * the score of URLs with this suffix; and the <strong>status</strong> field
+ * represents the domain's status.
  * 
  * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
- * @see TopLevelDomain
- * for info please see conf/domain-suffixes.xml
+ * @see TopLevelDomain for info please see conf/domain-suffixes.xml
  */
 public class DomainSuffix {
 
   /**
-   * Enumeration of the status of the tld. Please see domain-suffixes.xml. 
+   * Enumeration of the status of the tld. Please see domain-suffixes.xml.
    */
-  public enum Status { INFRASTRUCTURE, SPONSORED, UNSPONSORED
-    , STARTUP, PROPOSED, DELETED, PSEUDO_DOMAIN, DEPRECATED, IN_USE, NOT_IN_USE, REJECTED
+  public enum Status {
+    INFRASTRUCTURE, SPONSORED, UNSPONSORED, STARTUP, PROPOSED, DELETED, PSEUDO_DOMAIN, DEPRECATED, IN_USE, NOT_IN_USE, REJECTED
   };
 
   private String domain;
@@ -49,7 +49,7 @@ public class DomainSuffix {
 
   public static final float DEFAULT_BOOST = 1.0f;
   public static final Status DEFAULT_STATUS = Status.IN_USE;
-  
+
   public DomainSuffix(String domain, Status status, float boost) {
     this.domain = domain;
     this.status = status;
@@ -59,7 +59,7 @@ public class DomainSuffix {
   public DomainSuffix(String domain) {
     this(domain, DEFAULT_STATUS, DEFAULT_BOOST);
   }
-  
+
   public String getDomain() {
     return domain;
   }
@@ -71,7 +71,7 @@ public class DomainSuffix {
   public float getBoost() {
     return boost;
   }
-  
+
   @Override
   public String toString() {
     return domain;

Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java Thu Jan 29 05:38:59 2015
@@ -25,57 +25,62 @@ import org.slf4j.LoggerFactory;
 import org.apache.hadoop.util.StringUtils;
 
 /**
- * Storage class for <code>DomainSuffix</code> objects 
- * Note: this class is singleton
+ * Storage class for <code>DomainSuffix</code> objects. Note: this class is a
+ * singleton.
+ * 
  * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
  */
 public class DomainSuffixes {
-  private static final Logger LOG = LoggerFactory.getLogger(DomainSuffixes.class);
-  
-  private HashMap<String, DomainSuffix> domains = new HashMap<String, DomainSuffix>(); 
-  
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainSuffixes.class);
+
+  private HashMap<String, DomainSuffix> domains = new HashMap<String, DomainSuffix>();
+
   private static DomainSuffixes instance;
-  
+
   /** private ctor */
   private DomainSuffixes() {
     String file = "domain-suffixes.xml";
-    InputStream input = this.getClass().getClassLoader().getResourceAsStream(file);
+    InputStream input = this.getClass().getClassLoader()
+        .getResourceAsStream(file);
     try {
       new DomainSuffixesReader().read(this, input);
-    }
-    catch (Exception ex) {
+    } catch (Exception ex) {
       LOG.warn(StringUtils.stringifyException(ex));
     }
   }
-  
+
   /**
    * Singleton instance, lazy instantination
-   * @return returns the domain suffix instance 
+   * 
+   * @return returns the domain suffix instance
    */
   public static DomainSuffixes getInstance() {
-    if(instance == null) {
+    if (instance == null) {
       instance = new DomainSuffixes();
     }
     return instance;
   }
-  
+
   void addDomainSuffix(DomainSuffix tld) {
     domains.put(tld.getDomain(), tld);
   }
 
   /** return whether the extension is a registered domain entry */
   public boolean isDomainSuffix(String extension) {
-    return domains.containsKey(extension); 
+    return domains.containsKey(extension);
   }
-    
+
   /**
-   * Return the {@link DomainSuffix} object for the extension, if 
-   * extension is a top level domain returned object will be an 
-   * instance of {@link TopLevelDomain}
-   * @param extension of the domain
+   * Return the {@link DomainSuffix} object for the extension, if extension is a
+   * top level domain returned object will be an instance of
+   * {@link TopLevelDomain}
+   * 
+   * @param extension
+   *          of the domain
    */
   public DomainSuffix get(String extension) {
     return domains.get(extension);
   }
-  
+
 }

Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java Thu Jan 29 05:38:59 2015
@@ -36,16 +36,17 @@ import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 
 /**
- * For parsing xml files containing domain suffix definitions.
- * Parsed xml files should validate against 
- * <code>domain-suffixes.xsd</code>  
+ * For parsing xml files containing domain suffix definitions. Parsed xml files
+ * should validate against <code>domain-suffixes.xsd</code>
+ * 
  * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
  */
 class DomainSuffixesReader {
 
-  private static final Logger LOG = LoggerFactory.getLogger(DomainSuffixesReader.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainSuffixesReader.class);
 
-  void read(DomainSuffixes tldEntries, InputStream input) throws IOException{
+  void read(DomainSuffixes tldEntries, InputStream input) throws IOException {
     try {
 
       DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
@@ -54,28 +55,29 @@ class DomainSuffixesReader {
       Document document = builder.parse(new InputSource(input));
 
       Element root = document.getDocumentElement();
-      
-      if(root != null && root.getTagName().equals("domains")) {
-        
-        Element tlds = (Element)root.getElementsByTagName("tlds").item(0);
-        Element suffixes = (Element)root.getElementsByTagName("suffixes").item(0);
-        
-        //read tlds
-        readITLDs(tldEntries, (Element)tlds.getElementsByTagName("itlds").item(0));
-        readGTLDs(tldEntries, (Element)tlds.getElementsByTagName("gtlds").item(0));
-        readCCTLDs(tldEntries, (Element)tlds.getElementsByTagName("cctlds").item(0));
-        
+
+      if (root != null && root.getTagName().equals("domains")) {
+
+        Element tlds = (Element) root.getElementsByTagName("tlds").item(0);
+        Element suffixes = (Element) root.getElementsByTagName("suffixes")
+            .item(0);
+
+        // read tlds
+        readITLDs(tldEntries, (Element) tlds.getElementsByTagName("itlds")
+            .item(0));
+        readGTLDs(tldEntries, (Element) tlds.getElementsByTagName("gtlds")
+            .item(0));
+        readCCTLDs(tldEntries, (Element) tlds.getElementsByTagName("cctlds")
+            .item(0));
+
         readSuffixes(tldEntries, suffixes);
-      }
-      else {
+      } else {
         throw new IOException("xml file is not valid");
       }
-    }
-    catch (ParserConfigurationException ex) {
+    } catch (ParserConfigurationException ex) {
       LOG.warn(StringUtils.stringifyException(ex));
       throw new IOException(ex.getMessage());
-    }
-    catch (SAXException ex) {
+    } catch (SAXException ex) {
       LOG.warn(StringUtils.stringifyException(ex));
       throw new IOException(ex.getMessage());
     }
@@ -83,22 +85,24 @@ class DomainSuffixesReader {
 
   void readITLDs(DomainSuffixes tldEntries, Element el) {
     NodeList children = el.getElementsByTagName("tld");
-    for(int i=0;i<children.getLength();i++) {
-      tldEntries.addDomainSuffix(readGTLD((Element)children.item(i), Type.INFRASTRUCTURE));
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addDomainSuffix(readGTLD((Element) children.item(i),
+          Type.INFRASTRUCTURE));
     }
   }
-    
+
   void readGTLDs(DomainSuffixes tldEntries, Element el) {
     NodeList children = el.getElementsByTagName("tld");
-    for(int i=0;i<children.getLength();i++) {
-      tldEntries.addDomainSuffix(readGTLD((Element)children.item(i), Type.GENERIC));
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addDomainSuffix(readGTLD((Element) children.item(i),
+          Type.GENERIC));
     }
   }
 
   void readCCTLDs(DomainSuffixes tldEntries, Element el) throws IOException {
     NodeList children = el.getElementsByTagName("tld");
-    for(int i=0;i<children.getLength();i++) {
-      tldEntries.addDomainSuffix(readCCTLD((Element)children.item(i)));
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addDomainSuffix(readCCTLD((Element) children.item(i)));
     }
   }
 
@@ -113,39 +117,40 @@ class DomainSuffixesReader {
     String domain = el.getAttribute("domain");
     Status status = readStatus(el);
     float boost = readBoost(el);
-    String countryName = readCountryName(el); 
-    return new TopLevelDomain(domain, status, boost, countryName);  
+    String countryName = readCountryName(el);
+    return new TopLevelDomain(domain, status, boost, countryName);
   }
-  
+
   /** read optional field status */
   Status readStatus(Element el) {
     NodeList list = el.getElementsByTagName("status");
-    if(list == null || list.getLength() == 0)
+    if (list == null || list.getLength() == 0)
       return DomainSuffix.DEFAULT_STATUS;
     return Status.valueOf(list.item(0).getFirstChild().getNodeValue());
   }
-  
+
   /** read optional field boost */
   float readBoost(Element el) {
     NodeList list = el.getElementsByTagName("boost");
-    if(list == null || list.getLength() == 0)
+    if (list == null || list.getLength() == 0)
       return DomainSuffix.DEFAULT_BOOST;
     return Float.parseFloat(list.item(0).getFirstChild().getNodeValue());
   }
-  
-  /** read field countryname 
-    */
+
+  /**
+   * read field countryname
+   */
   String readCountryName(Element el) throws IOException {
     NodeList list = el.getElementsByTagName("country");
-    if(list == null || list.getLength() == 0)
+    if (list == null || list.getLength() == 0)
       throw new IOException("Country name should be given");
     return list.item(0).getNodeValue();
   }
-  
+
   void readSuffixes(DomainSuffixes tldEntries, Element el) {
     NodeList children = el.getElementsByTagName("suffix");
-    for(int i=0;i<children.getLength();i++) {
-      tldEntries.addDomainSuffix(readSuffix((Element)children.item(i)));
+    for (int i = 0; i < children.getLength(); i++) {
+      tldEntries.addDomainSuffix(readSuffix((Element) children.item(i)));
     }
   }
 
@@ -155,5 +160,5 @@ class DomainSuffixesReader {
     float boost = readBoost(el);
     return new DomainSuffix(domain, status, boost);
   }
-  
+
 }

Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java Thu Jan 29 05:38:59 2015
@@ -18,44 +18,50 @@
 package org.apache.nutch.util.domain;
 
 /**
- * (From wikipedia) A top-level domain (TLD) is the last part of an 
- * Internet domain name; that is, the letters which follow the final 
- * dot of any domain name. For example, in the domain name 
- * <code>www.website.com</code>, the top-level domain is <code>com</code>.
- *
+ * (From wikipedia) A top-level domain (TLD) is the last part of an Internet
+ * domain name; that is, the letters which follow the final dot of any domain
+ * name. For example, in the domain name <code>www.website.com</code>, the
+ * top-level domain is <code>com</code>.
+ * 
  * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
  * 
  * @see <a href="http://www.iana.org/"> iana.org</a>
  * 
- * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain"> Top-level_domain</a>
+ * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain">
+ *      Top-level_domain</a>
  */
 public class TopLevelDomain extends DomainSuffix {
 
-  public enum Type { INFRASTRUCTURE, GENERIC, COUNTRY };
-  
+  public enum Type {
+    INFRASTRUCTURE, GENERIC, COUNTRY
+  };
+
   private Type type;
   private String countryName = null;
-  
-  public TopLevelDomain(String domain, Type type, Status status, float boost){
+
+  public TopLevelDomain(String domain, Type type, Status status, float boost) {
     super(domain, status, boost);
     this.type = type;
   }
 
-  public TopLevelDomain(String domain, Status status, float boost, String countryName){
+  public TopLevelDomain(String domain, Status status, float boost,
+      String countryName) {
     super(domain, status, boost);
     this.type = Type.COUNTRY;
     this.countryName = countryName;
   }
-  
+
   public Type getType() {
     return type;
   }
 
-  /** Returns the country name if TLD is Country Code TLD
+  /**
+   * Returns the country name if TLD is Country Code TLD
+   * 
    * @return country name or null
-   */ 
-  public String getCountryName(){
+   */
+  public String getCountryName() {
     return countryName;
   }
-  
+
 }

Modified: nutch/trunk/src/java/org/apache/nutch/util/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/package-info.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
  * Miscellaneous utility classes.
  */
 package org.apache.nutch.util;
+

Modified: nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -41,16 +41,17 @@ import java.net.MalformedURLException;
 
 /** Adds basic searchable fields to a document. */
 public class CCIndexingFilter implements IndexingFilter {
-  public static final Logger LOG = LoggerFactory.getLogger(CCIndexingFilter.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(CCIndexingFilter.class);
 
   /** The name of the document field we use. */
   public static String FIELD = "cc";
 
   private Configuration conf;
 
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
-    throws IndexingException {
-    
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
     Metadata metadata = parse.getData().getParseMeta();
     // index the license
     String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL);
@@ -81,9 +82,11 @@ public class CCIndexingFilter implements
     return doc;
   }
 
-  /** Add the features represented by a license URL.  Urls are of the form
+  /**
+   * Add the features represented by a license URL. Urls are of the form
    * "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
-   * license feature. */
+   * license feature.
+   */
   public void addUrlFeatures(NutchDocument doc, String urlString) {
     try {
       URL url = new URL(urlString);
@@ -92,7 +95,7 @@ public class CCIndexingFilter implements
       StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
 
       if (names.hasMoreTokens())
-        names.nextToken();                        // throw away "licenses"
+        names.nextToken(); // throw away "licenses"
 
       // add a feature per component after "licenses"
       while (names.hasMoreTokens()) {
@@ -105,7 +108,7 @@ public class CCIndexingFilter implements
       }
     }
   }
-  
+
   private void addFeature(NutchDocument doc, String feature) {
     doc.add(FIELD, feature);
   }

Modified: nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)
+++ nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Thu Jan 29 05:38:59 2015
@@ -33,27 +33,25 @@ import javax.xml.parsers.*;
 import org.xml.sax.InputSource;
 import org.w3c.dom.*;
 
-
 /** Adds metadata identifying the Creative Commons license used, if any. */
 public class CCParseFilter implements HtmlParseFilter {
   public static final Logger LOG = LoggerFactory.getLogger(CCParseFilter.class);
 
-
-  /** Walks DOM tree, looking for RDF in comments and licenses in anchors.*/
+  /** Walks DOM tree, looking for RDF in comments and licenses in anchors. */
   public static class Walker {
-    private URL base;                             // base url of page
-    private String rdfLicense;                    // subject url found, if any
-    private URL relLicense;                       // license url found, if any
-    private URL anchorLicense;                    // anchor url found, if any
-    private String workType;                      // work type URI
+    private URL base; // base url of page
+    private String rdfLicense; // subject url found, if any
+    private URL relLicense; // license url found, if any
+    private URL anchorLicense; // anchor url found, if any
+    private String workType; // work type URI
 
     private Walker(URL base) {
       this.base = base;
     }
 
-    /** Scan the document adding attributes to metadata.*/
-    public static void walk(Node doc, URL base, Metadata metadata, Configuration conf)
-      throws ParseException {
+    /** Scan the document adding attributes to metadata. */
+    public static void walk(Node doc, URL base, Metadata metadata,
+        Configuration conf) throws ParseException {
 
       // walk the DOM tree, scanning for license data
       Walker walker = new Walker(base);
@@ -62,13 +60,13 @@ public class CCParseFilter implements Ht
       // interpret results of walk
       String licenseUrl = null;
       String licenseLocation = null;
-      if (walker.rdfLicense != null) {            // 1st choice: subject in RDF
+      if (walker.rdfLicense != null) { // 1st choice: subject in RDF
         licenseLocation = "rdf";
         licenseUrl = walker.rdfLicense;
-      } else if (walker.relLicense != null) {     // 2nd: anchor w/ rel=license
+      } else if (walker.relLicense != null) { // 2nd: anchor w/ rel=license
         licenseLocation = "rel";
         licenseUrl = walker.relLicense.toString();
-      } else if (walker.anchorLicense != null) {  // 3rd: anchor w/ CC license
+      } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC license
         licenseLocation = "a";
         licenseUrl = walker.anchorLicense.toString();
       } else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
@@ -78,7 +76,8 @@ public class CCParseFilter implements Ht
       // add license to metadata
       if (licenseUrl != null) {
         if (LOG.isInfoEnabled()) {
-          LOG.info("CC: found "+licenseUrl+" in "+licenseLocation+" of "+base);
+          LOG.info("CC: found " + licenseUrl + " in " + licenseLocation
+              + " of " + base);
         }
         metadata.add(CreativeCommons.LICENSE_URL, licenseUrl);
         metadata.add(CreativeCommons.LICENSE_LOCATION, licenseLocation);
@@ -86,36 +85,38 @@ public class CCParseFilter implements Ht
 
       if (walker.workType != null) {
         if (LOG.isInfoEnabled()) {
-          LOG.info("CC: found "+walker.workType+" in "+base);
+          LOG.info("CC: found " + walker.workType + " in " + base);
         }
         metadata.add(CreativeCommons.WORK_TYPE, walker.workType);
       }
 
     }
 
-    /** Scan the document looking for RDF in comments and license elements.*/
+    /** Scan the document looking for RDF in comments and license elements. */
     private void walk(Node node) {
-      
+
       // check element nodes for license URL
       if (node instanceof Element) {
-        findLicenseUrl((Element)node);
+        findLicenseUrl((Element) node);
       }
 
       // check comment nodes for license RDF
       if (node instanceof Comment) {
-        findRdf(((Comment)node).getData());
+        findRdf(((Comment) node).getData());
       }
 
       // recursively walk child nodes
       NodeList children = node.getChildNodes();
-      for (int i = 0; children != null && i < children.getLength(); i++ ) {
+      for (int i = 0; children != null && i < children.getLength(); i++) {
         walk(children.item(i));
       }
     }
 
-    /** Extract license url from element, if any.  Thse are the href attribute
-     * of anchor elements with rel="license".  These must also point to
-     * http://creativecommons.org/licenses/. */
+    /**
+     * Extract license url from element, if any. These are the href attribute of
+     * anchor elements with rel="license". These must also point to
+     * http://creativecommons.org/licenses/.
+     */
     private void findLicenseUrl(Element element) {
       // only look in Anchor elements
       if (!"a".equalsIgnoreCase(element.getTagName()))
@@ -125,54 +126,52 @@ public class CCParseFilter implements Ht
       String href = element.getAttribute("href");
       if (href == null)
         return;
-      
+
       try {
-        URL url = new URL(base, href);            // resolve the url
+        URL url = new URL(base, href); // resolve the url
 
         // check that it's a CC license URL
-        if ("http".equalsIgnoreCase(url.getProtocol()) &&
-            "creativecommons.org".equalsIgnoreCase(url.getHost()) &&
-            url.getPath() != null &&
-            url.getPath().startsWith("/licenses/") &&
-            url.getPath().length() > "/licenses/".length()) {
+        if ("http".equalsIgnoreCase(url.getProtocol())
+            && "creativecommons.org".equalsIgnoreCase(url.getHost())
+            && url.getPath() != null && url.getPath().startsWith("/licenses/")
+            && url.getPath().length() > "/licenses/".length()) {
 
           // check rel="license"
           String rel = element.getAttribute("rel");
           if (rel != null && "license".equals(rel) && this.relLicense == null) {
-            this.relLicense = url;                   // found rel license
+            this.relLicense = url; // found rel license
           } else if (this.anchorLicense == null) {
-            this.anchorLicense = url;             // found anchor license
+            this.anchorLicense = url; // found anchor license
           }
         }
-      } catch (MalformedURLException e) {         // ignore malformed urls
+      } catch (MalformedURLException e) { // ignore malformed urls
       }
     }
 
-   /** Configure a namespace aware XML parser. */
-    private static final DocumentBuilderFactory FACTORY
-      = DocumentBuilderFactory.newInstance();
+    /** Configure a namespace aware XML parser. */
+    private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory
+        .newInstance();
     static {
       FACTORY.setNamespaceAware(true);
     }
 
     /** Creative Commons' namespace URI. */
     private static final String CC_NS = "http://web.resource.org/cc/";
-    
+
     /** Dublin Core namespace URI. */
     private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
-    
+
     /** RDF syntax namespace URI. */
-    private static final String RDF_NS
-      = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+    private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
 
     private void findRdf(String comment) {
       // first check for likely RDF in comment
       int rdfPosition = comment.indexOf("RDF");
       if (rdfPosition < 0)
-        return;                                   // no RDF, abort
+        return; // no RDF, abort
       int nsPosition = comment.indexOf(CC_NS);
       if (nsPosition < 0)
-        return;                                   // no RDF, abort
+        return; // no RDF, abort
 
       // try to parse the XML
       Document doc;
@@ -181,28 +180,30 @@ public class CCParseFilter implements Ht
         doc = parser.parse(new InputSource(new StringReader(comment)));
       } catch (Exception e) {
         if (LOG.isWarnEnabled()) {
-          LOG.warn("CC: Failed to parse RDF in "+base+": "+e);
+          LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
         }
-        //e.printStackTrace();
+        // e.printStackTrace();
         return;
       }
 
       // check that root is rdf:RDF
       NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
       if (roots.getLength() != 1) {
-        if (LOG.isWarnEnabled()) { LOG.warn("CC: No RDF root in "+base); }
+        if (LOG.isWarnEnabled()) {
+          LOG.warn("CC: No RDF root in " + base);
+        }
         return;
       }
-      Element rdf = (Element)roots.item(0);
+      Element rdf = (Element) roots.item(0);
 
       // get cc:License nodes inside rdf:RDF
       NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
       for (int i = 0; i < licenses.getLength(); i++) {
 
-        Element l = (Element)licenses.item(i);
+        Element l = (Element) licenses.item(i);
 
         // license is rdf:about= attribute from cc:License
-        this.rdfLicense = l.getAttributeNodeNS(RDF_NS,"about").getValue();
+        this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue();
 
         // walk predicates of cc:License
         NodeList predicates = l.getChildNodes();
@@ -210,17 +211,17 @@ public class CCParseFilter implements Ht
           Node predicateNode = predicates.item(j);
           if (!(predicateNode instanceof Element))
             continue;
-          Element predicateElement = (Element)predicateNode;
+          Element predicateElement = (Element) predicateNode;
 
           // extract predicates of cc:xxx predicates
           if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
             continue;
           }
-        
+
           // add object and predicate to metadata
           // metadata.put(object, predicate);
           // if (LOG.isInfoEnabled()) {
-          //   LOG.info("CC: found: "+predicate+"="+object);
+          // LOG.info("CC: found: "+predicate+"="+object);
           // }
         }
       }
@@ -230,10 +231,11 @@ public class CCParseFilter implements Ht
       for (int i = 0; i < works.getLength(); i++) {
         // get dc:type nodes from cc:Work
         NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
-        
+
         for (int j = 0; j < types.getLength(); j++) {
-          Element type = (Element)types.item(j);
-          String workUri = type.getAttributeNodeNS(RDF_NS, "resource").getValue();
+          Element type = (Element) types.item(j);
+          String workUri = type.getAttributeNodeNS(RDF_NS, "resource")
+              .getValue();
           this.workType = WORK_TYPE_NAMES.get(workUri);
         }
       }
@@ -246,16 +248,20 @@ public class CCParseFilter implements Ht
     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
-    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", "interactive");
+    WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive",
+        "interactive");
     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
   }
 
   private Configuration conf;
 
-  /** Adds metadata or otherwise modifies a parse of an HTML document, given
-   * the DOM tree of a page. */
-  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+  /**
+   * Adds metadata or otherwise modifies a parse of an HTML document, given the
+   * DOM tree of a page.
+   */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
 
     // get parse obj
     Parse parse = parseResult.get(content.getUrl());
@@ -266,9 +272,8 @@ public class CCParseFilter implements Ht
       base = new URL(content.getBaseUrl());
     } catch (MalformedURLException e) {
       Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
-      parseResult.put(content.getUrl(), 
-                      new ParseText(emptyParse.getText()), 
-                      emptyParse.getData());
+      parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()),
+          emptyParse.getData());
       return parseResult;
     }
 
@@ -277,9 +282,8 @@ public class CCParseFilter implements Ht
       Walker.walk(doc, base, parse.getData().getParseMeta(), getConf());
     } catch (ParseException e) {
       Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
-      parseResult.put(content.getUrl(), 
-                      new ParseText(emptyParse.getText()), 
-                      emptyParse.getData());
+      parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()),
+          emptyParse.getData());
       return parseResult;
     }
 

Modified: nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Thu Jan 29 05:38:59 2015
@@ -30,30 +30,28 @@ import java.io.*;
 
 public class TestCCParseFilter {
 
-  private static final File testDir =
-    new File(System.getProperty("test.input"));
+  private static final File testDir = new File(System.getProperty("test.input"));
 
   @Test
   public void testPages() throws Exception {
     pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
-             "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
+        "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
     // Tika returns <a> whereas parse-html returns <rel>
     // check later
     pageTest(new File(testDir, "rel.html"), "http://foo.com/",
-             "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
+        "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
     // Tika returns <a> whereas parse-html returns <rdf>
     // check later
     pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
-             "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
+        "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
   }
 
-  public void pageTest(File file, String url,
-                       String license, String location, String type)
-    throws Exception {
+  public void pageTest(File file, String url, String license, String location,
+      String type) throws Exception {
 
     String contentType = "text/html";
     InputStream in = new FileInputStream(file);
-    ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
+    ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
     byte[] buffer = new byte[1024];
     int i;
     while ((i = in.read(buffer)) != -1) {
@@ -63,14 +61,13 @@ public class TestCCParseFilter {
     byte[] bytes = out.toByteArray();
     Configuration conf = NutchConfiguration.create();
 
-    Content content =
-      new Content(url, url, bytes, contentType, new Metadata(), conf);
-    Parse parse =  new ParseUtil(conf).parse(content).get(content.getUrl());
-    
+    Content content = new Content(url, url, bytes, contentType, new Metadata(),
+        conf);
+    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+
     Metadata metadata = parse.getData().getParseMeta();
     Assert.assertEquals(license, metadata.get("License-Url"));
     Assert.assertEquals(location, metadata.get("License-Location"));
     Assert.assertEquals(type, metadata.get("Work-Type"));
   }
 }
-

Modified: nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -38,78 +38,77 @@ import org.apache.nutch.parse.ParseData;
  * @author mattmann
  * @since NUTCH-444
  * 
- * An {@link IndexingFilter} implementation to pull out the
- * relevant extracted {@link Metadata} fields from the RSS feeds
- * and into the index.
- *
+ *        An {@link IndexingFilter} implementation to pull out the relevant
+ *        extracted {@link Metadata} fields from the RSS feeds and into the
+ *        index.
+ * 
  */
 public class FeedIndexingFilter implements IndexingFilter {
-  
+
   public static final String dateFormatStr = "yyyyMMddHHmm";
-  
+
   private Configuration conf;
-  
+
   private final static String PUBLISHED_DATE = "publishedDate";
-  
+
   private final static String UPDATED_DATE = "updatedDate";
-  
+
   /**
    * Extracts out the relevant fields:
    * 
    * <ul>
-   *  <li>FEED_AUTHOR</li>
-   *  <li>FEED_TAGS</li>
-   *  <li>FEED_PUBLISHED</li>
-   *  <li>FEED_UPDATED</li>
-   *  <li>FEED</li>
+   * <li>FEED_AUTHOR</li>
+   * <li>FEED_TAGS</li>
+   * <li>FEED_PUBLISHED</li>
+   * <li>FEED_UPDATED</li>
+   * <li>FEED</li>
    * </ul>
    * 
-   * And sends them to the {@link Indexer} for indexing within the Nutch
-   * index.
-   *  
+   * And sends them to the {@link Indexer} for indexing within the Nutch index.
+   * 
    */
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
-                         Inlinks inlinks) throws IndexingException {
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
     ParseData parseData = parse.getData();
     Metadata parseMeta = parseData.getParseMeta();
-    
+
     String[] authors = parseMeta.getValues(Feed.FEED_AUTHOR);
     String[] tags = parseMeta.getValues(Feed.FEED_TAGS);
     String published = parseMeta.get(Feed.FEED_PUBLISHED);
     String updated = parseMeta.get(Feed.FEED_UPDATED);
     String feed = parseMeta.get(Feed.FEED);
-    
+
     if (authors != null) {
       for (String author : authors) {
         doc.add(Feed.FEED_AUTHOR, author);
       }
     }
-    
+
     if (tags != null) {
       for (String tag : tags) {
         doc.add(Feed.FEED_TAGS, tag);
       }
     }
-    
+
     if (feed != null)
       doc.add(Feed.FEED, feed);
-    
+
     if (published != null) {
       Date date = new Date(Long.parseLong(published));
       doc.add(PUBLISHED_DATE, date);
     }
-    
+
     if (updated != null) {
       Date date = new Date(Long.parseLong(updated));
       doc.add(UPDATED_DATE, date);
     }
-        
+
     return doc;
   }
 
   /**
-   * @return the {@link Configuration} object used to configure
-   * this {@link IndexingFilter}.
+   * @return the {@link Configuration} object used to configure this
+   *         {@link IndexingFilter}.
    */
   public Configuration getConf() {
     return conf;
@@ -119,8 +118,9 @@ public class FeedIndexingFilter implemen
    * Sets the {@link Configuration} object used to configure this
    * {@link IndexingFilter}.
    * 
-   * @param conf The {@link Configuration} object used to configure
-   * this {@link IndexingFilter}.
+   * @param conf
+   *          The {@link Configuration} object used to configure this
+   *          {@link IndexingFilter}.
    */
   public void setConf(Configuration conf) {
     this.conf = conf;

Modified: nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java (original)
+++ nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
  * Indexing filter to index meta data from RSS feeds.
  */
 package org.apache.nutch.indexer.feed;
+

Modified: nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java (original)
+++ nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java Thu Jan 29 05:38:59 2015
@@ -66,10 +66,10 @@ import com.sun.syndication.io.SyndFeedIn
  * @author mattmann
  * @since NUTCH-444
  * 
- * <p>
- * A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced links
- * and content present in the feed.
- * </p>
+ *        <p>
+ *        A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced
+ *        links and content present in the feed.
+ *        </p>
  * 
  */
 public class FeedParser implements Parser {
@@ -99,8 +99,8 @@ public class FeedParser implements Parse
    *          A {@link Content} object representing the feed that is being
    *          parsed by this {@link Parser}.
    * 
-   * @return A {@link ParseResult} containing all {@link Parse}d feeds that
-   *         were present in the feed file that this {@link Parser} dealt with.
+   * @return A {@link ParseResult} containing all {@link Parse}d feeds that were
+   *         present in the feed file that this {@link Parser} dealt with.
    * 
    */
   public ParseResult getParse(Content content) {
@@ -111,8 +111,8 @@ public class FeedParser implements Parse
     detector.autoDetectClues(content, true);
     String encoding = detector.guessEncoding(content, defaultEncoding);
     try {
-      InputSource input = new InputSource(new ByteArrayInputStream(content
-          .getContent()));
+      InputSource input = new InputSource(new ByteArrayInputStream(
+          content.getContent()));
       input.setEncoding(encoding);
       SyndFeedInput feedInput = new SyndFeedInput();
       feed = feedInput.build(input);
@@ -134,8 +134,8 @@ public class FeedParser implements Parse
     }
 
     List<?> entries = feed.getEntries();
-    for(Object entry: entries) {
-      addToMap(parseResult, feed, feedLink, (SyndEntry)entry, content);
+    for (Object entry : entries) {
+      addToMap(parseResult, feed, feedLink, (SyndEntry) entry, content);
     }
 
     String feedDesc = stripTags(feed.getDescriptionEx());
@@ -170,8 +170,8 @@ public class FeedParser implements Parse
     this.parserFactory = new ParserFactory(conf);
     this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
     this.filters = new URLFilters(conf);
-    this.defaultEncoding =
-      conf.get("parser.character.encoding.default", "windows-1252");
+    this.defaultEncoding = conf.get("parser.character.encoding.default",
+        "windows-1252");
   }
 
   /**
@@ -255,8 +255,8 @@ public class FeedParser implements Parse
     if (text == null) {
       List<?> contents = entry.getContents();
       StringBuilder buf = new StringBuilder();
-      for (Object syndContent: contents) {
-        buf.append(((SyndContent)syndContent).getValue());
+      for (Object syndContent : contents) {
+        buf.append(((SyndContent) syndContent).getValue());
       }
       text = buf.toString();
     }
@@ -273,9 +273,9 @@ public class FeedParser implements Parse
       ParseData data = parse.getData();
       data.getContentMeta().remove(Response.CONTENT_TYPE);
       mergeMetadata(data.getParseMeta(), parseMeta);
-      parseResult.put(link, new ParseText(parse.getText()), new ParseData(
-          ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data
-              .getContentMeta(), data.getParseMeta()));
+      parseResult.put(link, new ParseText(parse.getText()),
+          new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(),
+              data.getContentMeta(), data.getParseMeta()));
     } else {
       contentMeta.remove(Response.CONTENT_TYPE);
       parseResult.put(link, new ParseText(text), new ParseData(
@@ -323,7 +323,7 @@ public class FeedParser implements Parse
       }
     }
 
-    for (Object i: categories) {
+    for (Object i : categories) {
       parseMeta.add(Feed.FEED_TAGS, ((SyndCategory) i).getName());
     }
 

Modified: nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java (original)
+++ nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
  * Parse RSS feeds.
  */
 package org.apache.nutch.parse.feed;
+

Modified: nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java (original)
+++ nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java Thu Jan 29 05:38:59 2015
@@ -43,7 +43,7 @@ import org.apache.nutch.util.NutchConfig
  * 
  * @author mattmann
  * 
- * Test Suite for the {@link FeedParser}.
+ *         Test Suite for the {@link FeedParser}.
  * 
  */
 public class TestFeedParser {
@@ -96,18 +96,17 @@ public class TestFeedParser {
 
       Assert.assertEquals(3, parseResult.size());
 
-      boolean hasLink1 = false, hasLink2 = false, hasLink3=false;
+      boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
 
       for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j
           .hasNext();) {
         Map.Entry<Text, Parse> entry = j.next();
-        if (entry.getKey().toString().equals(
-            "http://www-scf.usc.edu/~mattmann/")) {
+        if (entry.getKey().toString()
+            .equals("http://www-scf.usc.edu/~mattmann/")) {
           hasLink1 = true;
         } else if (entry.getKey().toString().equals("http://www.nutch.org/")) {
           hasLink2 = true;
-        }
-        else if(entry.getKey().toString().equals(urlString)){
+        } else if (entry.getKey().toString().equals(urlString)) {
           hasLink3 = true;
         }
 

Modified: nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java (original)
+++ nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Thu Jan 29 05:38:59 2015
@@ -38,15 +38,16 @@ public class HeadingsParseFilter impleme
    * Pattern used to strip surpluss whitespace
    */
   protected static Pattern whitespacePattern = Pattern.compile("\\s+");
-    
+
   private Configuration conf;
   private String[] headings;
   private boolean multiValued = false;
 
-  public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
     Parse parse = parseResult.get(content.getUrl());
 
-    for (int i = 0 ; headings != null && i < headings.length ; i++ ) {
+    for (int i = 0; headings != null && i < headings.length; i++) {
       List<String> discoveredHeadings = getElement(doc, headings[i]);
 
       if (discoveredHeadings.size() > 0) {
@@ -89,7 +90,7 @@ public class HeadingsParseFilter impleme
       if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
         if (element.equalsIgnoreCase(currentNode.getNodeName())) {
           headings.add(getNodeValue(currentNode));
-          
+
           // Check for multiValued here, if disabled we don't need
           // to discover more headings.
           if (!multiValued) {

Modified: nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java (original)
+++ nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
  * Parse filter to extract headings (h1, h2, etc.) from DOM parse tree.
  */
 package org.apache.nutch.parse.headings;
+

Modified: nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -30,13 +30,15 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 /**
- * Indexing filter that offers an option to either index all inbound anchor text for 
- * a document or deduplicate anchors. Deduplication does have it's con's, 
+ * Indexing filter that offers an option to either index all inbound anchor text
+ * for a document or deduplicate anchors. Deduplication does have it's con's,
+ * 
  * @see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
  */
 public class AnchorIndexingFilter implements IndexingFilter {
 
-  public static final Logger LOG = LoggerFactory.getLogger(AnchorIndexingFilter.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(AnchorIndexingFilter.class);
   private Configuration conf;
   private boolean deduplicate = false;
 
@@ -49,6 +51,7 @@ public class AnchorIndexingFilter implem
     deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false);
     LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off"));
   }
+
   /**
    * Get the {@link Configuration} object
    */
@@ -57,28 +60,33 @@ public class AnchorIndexingFilter implem
   }
 
   /**
-   * The {@link AnchorIndexingFilter} filter object which supports boolean 
-   * configuration settings for the deduplication of anchors. 
-   * See {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
-   *  
-   * @param doc The {@link NutchDocument} object
-   * @param parse The relevant {@link Parse} object passing through the filter 
-   * @param url URL to be filtered for anchor text
-   * @param datum The {@link CrawlDatum} entry
-   * @param inlinks The {@link Inlinks} containing anchor text
+   * The {@link AnchorIndexingFilter} filter object which supports boolean
+   * configuration settings for the deduplication of anchors. See
+   * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
+   * 
+   * @param doc
+   *          The {@link NutchDocument} object
+   * @param parse
+   *          The relevant {@link Parse} object passing through the filter
+   * @param url
+   *          URL to be filtered for anchor text
+   * @param datum
+   *          The {@link CrawlDatum} entry
+   * @param inlinks
+   *          The {@link Inlinks} containing anchor text
    * @return filtered NutchDocument
    */
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
-    Inlinks inlinks) throws IndexingException {
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
 
-    String[] anchors = (inlinks != null ? inlinks.getAnchors()
-      : new String[0]);
+    String[] anchors = (inlinks != null ? inlinks.getAnchors() : new String[0]);
 
     HashSet<String> set = null;
 
     for (int i = 0; i < anchors.length; i++) {
       if (deduplicate) {
-        if (set == null) set = new HashSet<String>();
+        if (set == null)
+          set = new HashSet<String>();
         String lcAnchor = anchors[i].toLowerCase();
 
         // Check if already processed the current anchor

Modified: nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -29,12 +29,11 @@ import org.junit.Assert;
 import org.junit.Test;
 
 /**
- * JUnit test case which tests
- * 1. that anchor text is obtained
- * 2. that anchor deduplication functionality is working
+ * JUnit test case which tests 1. that anchor text is obtained 2. that anchor
+ * deduplication functionality is working
  * 
  * @author lewismc
- *
+ * 
  */
 public class TestAnchorIndexingFilter {
 
@@ -52,14 +51,17 @@ public class TestAnchorIndexingFilter {
     inlinks.add(new Inlink("http://test2.com/", "text2"));
     inlinks.add(new Inlink("http://test3.com/", "text2"));
     try {
-      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), inlinks);
-    } catch(Exception e){
+      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+          new CrawlDatum(), inlinks);
+    } catch (Exception e) {
       e.printStackTrace();
       Assert.fail(e.getMessage());
     }
     Assert.assertNotNull(doc);
-    Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
-    Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor").getValues().size());
+    Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames()
+        .contains("anchor"));
+    Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor")
+        .getValues().size());
   }
 
 }

Modified: nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -39,42 +39,48 @@ import java.util.Date;
 
 import org.apache.hadoop.conf.Configuration;
 
-/** 
- * Adds basic searchable fields to a document. 
- * The fields added are : domain, host, url, content, title, cache, tstamp
- * domain is included depending on {@code indexer.add.domain} in nutch-default.xml.
- * title is truncated as per {@code indexer.max.title.length} in nutch-default.xml. 
- *       (As per NUTCH-1004, a zero-length title is not added)
- * content is truncated as per {@code indexer.max.content.length} in nutch-default.xml.
+/**
+ * Adds basic searchable fields to a document. The fields added are : domain,
+ * host, url, content, title, cache, tstamp domain is included depending on
+ * {@code indexer.add.domain} in nutch-default.xml. title is truncated as per
+ * {@code indexer.max.title.length} in nutch-default.xml. (As per NUTCH-1004, a
+ * zero-length title is not added) content is truncated as per
+ * {@code indexer.max.content.length} in nutch-default.xml.
  */
 public class BasicIndexingFilter implements IndexingFilter {
-  public static final Logger LOG = LoggerFactory.getLogger(BasicIndexingFilter.class);
+  public static final Logger LOG = LoggerFactory
+      .getLogger(BasicIndexingFilter.class);
 
   private int MAX_TITLE_LENGTH;
   private int MAX_CONTENT_LENGTH;
   private boolean addDomain = false;
   private Configuration conf;
 
- /**
-  * The {@link BasicIndexingFilter} filter object which supports few 
-  * configuration settings for adding basic searchable fields. 
-  * See {@code indexer.add.domain}, {@code indexer.max.title.length}, 
-  * {@code indexer.max.content.length} in nutch-default.xml.
-  *  
-  * @param doc The {@link NutchDocument} object
-  * @param parse The relevant {@link Parse} object passing through the filter 
-  * @param url URL to be filtered for anchor text
-  * @param datum The {@link CrawlDatum} entry
-  * @param inlinks The {@link Inlinks} containing anchor text
-  * @return filtered NutchDocument
-  */
-  public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
-    throws IndexingException {
+  /**
+   * The {@link BasicIndexingFilter} filter object which supports few
+   * configuration settings for adding basic searchable fields. See
+   * {@code indexer.add.domain}, {@code indexer.max.title.length},
+   * {@code indexer.max.content.length} in nutch-default.xml.
+   * 
+   * @param doc
+   *          The {@link NutchDocument} object
+   * @param parse
+   *          The relevant {@link Parse} object passing through the filter
+   * @param url
+   *          URL to be filtered for anchor text
+   * @param datum
+   *          The {@link CrawlDatum} entry
+   * @param inlinks
+   *          The {@link Inlinks} containing anchor text
+   * @return filtered NutchDocument
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
 
     Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
     String reprUrlString = reprUrl != null ? reprUrl.toString() : null;
     String urlString = url.toString();
-    
+
     String host = null;
     try {
       URL u;
@@ -83,11 +89,11 @@ public class BasicIndexingFilter impleme
       } else {
         u = new URL(urlString);
       }
-      
+
       if (addDomain) {
         doc.add("domain", URLUtil.getDomainName(u));
       }
-      
+
       host = u.getHost();
     } catch (MalformedURLException e) {
       throw new IndexingException(e);
@@ -108,7 +114,10 @@ public class BasicIndexingFilter impleme
 
     // title
     String title = parse.getData().getTitle();
-    if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) {      // truncate title if needed
+    if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate
+                                                                      // title
+                                                                      // if
+                                                                      // needed
       title = title.substring(0, MAX_TITLE_LENGTH);
     }
 

Modified: nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -34,20 +34,20 @@ import org.junit.Test;
 import java.util.Date;
 
 /**
- * JUnit test case which tests 
- * 1. that basic searchable fields are added to a document
- * 2. that domain is added as per {@code indexer.add.domain} in nutch-default.xml.
- * 3. that title is truncated as per {@code indexer.max.title.length} in nutch-default.xml.
- * 4. that content is truncated as per {@code indexer.max.content.length} in nutch-default.xml.
+ * JUnit test case which tests 1. that basic searchable fields are added to a
+ * document 2. that domain is added as per {@code indexer.add.domain} in
+ * nutch-default.xml. 3. that title is truncated as per
+ * {@code indexer.max.title.length} in nutch-default.xml. 4. that content is
+ * truncated as per {@code indexer.max.content.length} in nutch-default.xml.
  * 
  * @author tejasp
- *
+ * 
  */
 
 public class TestBasicIndexingFilter {
 
   @Test
-  public void testBasicIndexingFilter() throws Exception { 
+  public void testBasicIndexingFilter() throws Exception {
     Configuration conf = NutchConfiguration.create();
     conf.setInt("indexer.max.title.length", 10);
     conf.setBoolean("indexer.add.domain", true);
@@ -63,8 +63,10 @@ public class TestBasicIndexingFilter {
     Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
     Metadata metaData = new Metadata();
     metaData.add("Language", "en/us");
-    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
-    ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);
+    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+        outlinks, metaData);
+    ParseImpl parse = new ParseImpl(
+        "this is a sample foo bar page. hope you enjoy it.", parseData);
 
     CrawlDatum crawlDatum = new CrawlDatum();
     crawlDatum.setFetchTime(100L);
@@ -72,18 +74,26 @@ public class TestBasicIndexingFilter {
     Inlinks inlinks = new Inlinks();
 
     try {
-      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
-    } catch(Exception e){
+      filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+          crawlDatum, inlinks);
+    } catch (Exception e) {
       e.printStackTrace();
       Assert.fail(e.getMessage());
     }
     Assert.assertNotNull(doc);
-    Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc.getField("title").getValues().get(0));
-    Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc.getField("domain").getValues().get(0));
-    Assert.assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", doc.getField("host").getValues().get(0));
-    Assert.assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html", 
-      doc.getField("url").getValues().get(0));
-    Assert.assertEquals("test content", "this is a sample foo", doc.getField("content").getValues().get(0));
-    Assert.assertEquals("test fetch time", new Date(100L), (Date)doc.getField("tstamp").getValues().get(0));
+    Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc
+        .getField("title").getValues().get(0));
+    Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc
+        .getField("domain").getValues().get(0));
+    Assert.assertEquals("test host, expect \"nutch.apache.org\"",
+        "nutch.apache.org", doc.getField("host").getValues().get(0));
+    Assert.assertEquals(
+        "test url, expect \"http://nutch.apache.org/index.html\"",
+        "http://nutch.apache.org/index.html", doc.getField("url").getValues()
+            .get(0));
+    Assert.assertEquals("test content", "this is a sample foo",
+        doc.getField("content").getValues().get(0));
+    Assert.assertEquals("test fetch time", new Date(100L),
+        (Date) doc.getField("tstamp").getValues().get(0));
   }
 }

Modified: nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java (original)
+++ nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java Thu Jan 29 05:38:59 2015
@@ -41,13 +41,16 @@ import com.maxmind.geoip2.record.Subdivi
 import com.maxmind.geoip2.record.Traits;
 
 /**
- * <p>Simple utility class which enables efficient, structured
- * {@link org.apache.nutch.indexer.NutchDocument} building based on input 
- * from {@link GeoIPIndexingFilter}, where configuration is also read.</p>
- * <p>Based on the nature of the input, this class wraps factory type
- * implementations for populating {@link org.apache.nutch.indexer.NutchDocument}'s
- * with the correct {@link org.apache.nutch.indexer.NutchField} information.
- *
+ * <p>
+ * Simple utility class which enables efficient, structured
+ * {@link org.apache.nutch.indexer.NutchDocument} building based on input from
+ * {@link GeoIPIndexingFilter}, where configuration is also read.
+ * </p>
+ * <p>
+ * Based on the nature of the input, this class wraps factory type
+ * implementations for populating {@link org.apache.nutch.indexer.NutchDocument}
+ * instances with the correct {@link org.apache.nutch.indexer.NutchField} information.
+ * 
  */
 public class GeoIPDocumentCreator {
 
@@ -58,13 +61,15 @@ public class GeoIPDocumentCreator {
   }
 
   public static NutchDocument createDocFromInsightsService(String serverIp,
-      NutchDocument doc, WebServiceClient client) throws UnknownHostException, IOException, GeoIp2Exception {
+      NutchDocument doc, WebServiceClient client) throws UnknownHostException,
+      IOException, GeoIp2Exception {
     doc.add("ip", serverIp);
-    InsightsResponse response = client.insights(InetAddress.getByName(serverIp));
-    //CityResponse response = client.city(InetAddress.getByName(serverIp));
-    
+    InsightsResponse response = client
+        .insights(InetAddress.getByName(serverIp));
+    // CityResponse response = client.city(InetAddress.getByName(serverIp));
+
     City city = response.getCity();
-    doc.add("cityName", city.getName());       // 'Minneapolis'
+    doc.add("cityName", city.getName()); // 'Minneapolis'
     doc.add("cityConfidence", city.getConfidence()); // 50
     doc.add("cityGeoNameId", city.getGeoNameId());
 
@@ -74,31 +79,32 @@ public class GeoIPDocumentCreator {
     doc.add("continentName", continent.getName());
 
     Country country = response.getCountry();
-    doc.add("countryIsoCode", country.getIsoCode());            // 'US'
-    doc.add("countryName", country.getName());               // 'United States'
-    doc.add("countryConfidence", country.getConfidence());         // 99
+    doc.add("countryIsoCode", country.getIsoCode()); // 'US'
+    doc.add("countryName", country.getName()); // 'United States'
+    doc.add("countryConfidence", country.getConfidence()); // 99
     doc.add("countryGeoName", country.getGeoNameId());
 
     Location location = response.getLocation();
-    doc.add("latLon", location.getLatitude() + "," + location.getLongitude());    // 44.9733, -93.2323
-    doc.add("accRadius", location.getAccuracyRadius());  // 3
-    doc.add("timeZone", location.getTimeZone());        // 'America/Chicago'
+    doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
+                                                                               // -93.2323
+    doc.add("accRadius", location.getAccuracyRadius()); // 3
+    doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
     doc.add("metroCode", location.getMetroCode());
 
     Postal postal = response.getPostal();
-    doc.add("postalCode", postal.getCode());       // '55455'
+    doc.add("postalCode", postal.getCode()); // '55455'
     doc.add("postalConfidence", postal.getConfidence()); // 40
 
     RepresentedCountry rCountry = response.getRepresentedCountry();
     doc.add("countryType", rCountry.getType());
 
     Subdivision subdivision = response.getMostSpecificSubdivision();
-    doc.add("subDivName", subdivision.getName());       // 'Minnesota'
-    doc.add("subDivIdoCode", subdivision.getIsoCode());    // 'MN'
+    doc.add("subDivName", subdivision.getName()); // 'Minnesota'
+    doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
     doc.add("subDivConfidence", subdivision.getConfidence()); // 90
     doc.add("subDivGeoNameId", subdivision.getGeoNameId());
 
-    Traits traits = response.getTraits(); 
+    Traits traits = response.getTraits();
     doc.add("autonSystemNum", traits.getAutonomousSystemNumber());
     doc.add("autonSystemOrg", traits.getAutonomousSystemOrganization());
     doc.add("domain", traits.getDomain());
@@ -112,20 +118,23 @@ public class GeoIPDocumentCreator {
 
   @SuppressWarnings("unused")
   public static NutchDocument createDocFromCityService(String serverIp,
-      NutchDocument doc, WebServiceClient client) throws UnknownHostException, IOException, GeoIp2Exception {
+      NutchDocument doc, WebServiceClient client) throws UnknownHostException,
+      IOException, GeoIp2Exception {
     CityResponse response = client.city(InetAddress.getByName(serverIp));
     return doc;
   }
 
   @SuppressWarnings("unused")
   public static NutchDocument createDocFromCountryService(String serverIp,
-      NutchDocument doc, WebServiceClient client) throws UnknownHostException, IOException, GeoIp2Exception {
-    CountryResponse response = client.country(InetAddress.getByName(serverIp));    
+      NutchDocument doc, WebServiceClient client) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    CountryResponse response = client.country(InetAddress.getByName(serverIp));
     return doc;
   }
 
-  public static NutchDocument createDocFromIspDb(String serverIp, NutchDocument doc, 
-      DatabaseReader reader) throws UnknownHostException, IOException, GeoIp2Exception {
+  public static NutchDocument createDocFromIspDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+      IOException, GeoIp2Exception {
     IspResponse response = reader.isp(InetAddress.getByName(serverIp));
     doc.add("ip", serverIp);
     doc.add("autonSystemNum", response.getAutonomousSystemNumber());
@@ -135,8 +144,9 @@ public class GeoIPDocumentCreator {
     return doc;
   }
 
-  public static NutchDocument createDocFromDomainDb(String serverIp, NutchDocument doc, 
-      DatabaseReader reader) throws UnknownHostException, IOException, GeoIp2Exception {
+  public static NutchDocument createDocFromDomainDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+      IOException, GeoIp2Exception {
     DomainResponse response = reader.domain(InetAddress.getByName(serverIp));
     doc.add("ip", serverIp);
     doc.add("domain", response.getDomain());
@@ -144,20 +154,23 @@ public class GeoIPDocumentCreator {
   }
 
   public static NutchDocument createDocFromConnectionDb(String serverIp,
-      NutchDocument doc, DatabaseReader reader) throws UnknownHostException, IOException, GeoIp2Exception {
-    ConnectionTypeResponse response = reader.connectionType(InetAddress.getByName(serverIp));
+      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+      IOException, GeoIp2Exception {
+    ConnectionTypeResponse response = reader.connectionType(InetAddress
+        .getByName(serverIp));
     doc.add("ip", serverIp);
     doc.add("connType", response.getConnectionType().toString());
     return doc;
   }
 
-  public static NutchDocument createDocFromCityDb(String serverIp, NutchDocument doc, 
-      DatabaseReader reader) throws UnknownHostException, IOException, GeoIp2Exception {
+  public static NutchDocument createDocFromCityDb(String serverIp,
+      NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+      IOException, GeoIp2Exception {
     doc.add("ip", serverIp);
     CityResponse response = reader.city(InetAddress.getByName(serverIp));
 
     City city = response.getCity();
-    doc.add("cityName", city.getName());       // 'Minneapolis'
+    doc.add("cityName", city.getName()); // 'Minneapolis'
     doc.add("cityConfidence", city.getConfidence()); // 50
     doc.add("cityGeoNameId", city.getGeoNameId());
 
@@ -167,27 +180,28 @@ public class GeoIPDocumentCreator {
     doc.add("continentName", continent.getName());
 
     Country country = response.getCountry();
-    doc.add("countryIsoCode", country.getIsoCode());            // 'US'
-    doc.add("countryName", country.getName());               // 'United States'
-    doc.add("countryConfidence", country.getConfidence());         // 99
+    doc.add("countryIsoCode", country.getIsoCode()); // 'US'
+    doc.add("countryName", country.getName()); // 'United States'
+    doc.add("countryConfidence", country.getConfidence()); // 99
     doc.add("countryGeoName", country.getGeoNameId());
 
     Location location = response.getLocation();
-    doc.add("latLon", location.getLatitude() + "," + location.getLongitude());    // 44.9733, -93.2323
-    doc.add("accRadius", location.getAccuracyRadius());  // 3
-    doc.add("timeZone", location.getTimeZone());        // 'America/Chicago'
+    doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
+                                                                               // -93.2323
+    doc.add("accRadius", location.getAccuracyRadius()); // 3
+    doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
     doc.add("metroCode", location.getMetroCode());
 
     Postal postal = response.getPostal();
-    doc.add("postalCode", postal.getCode());       // '55455'
+    doc.add("postalCode", postal.getCode()); // '55455'
     doc.add("postalConfidence", postal.getConfidence()); // 40
 
     RepresentedCountry rCountry = response.getRepresentedCountry();
     doc.add("countryType", rCountry.getType());
 
     Subdivision subdivision = response.getMostSpecificSubdivision();
-    doc.add("subDivName", subdivision.getName());       // 'Minnesota'
-    doc.add("subDivIdoCode", subdivision.getIsoCode());    // 'MN'
+    doc.add("subDivName", subdivision.getName()); // 'Minnesota'
+    doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
     doc.add("subDivConfidence", subdivision.getConfidence()); // 90
     doc.add("subDivGeoNameId", subdivision.getGeoNameId());
     return doc;

Modified: nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -34,16 +34,22 @@ import com.maxmind.geoip2.DatabaseReader
 import com.maxmind.geoip2.WebServiceClient;
 
 /**
- * <p>This plugin implements an indexing filter which takes 
- * advantage of the 
- * <a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</p>
- * <p>The third party library distribution provides an API for the GeoIP2 
- * <a href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web services</a> 
- * and <a href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. 
- * The API also works with the free 
- * <a href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.</p>
- * <p>Depending on the service level agreement, you have with the GeoIP service provider,
- * the plugin can add a number of the following fields to the index data model:
+ * <p>
+ * This plugin implements an indexing filter which takes advantage of the <a
+ * href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.
+ * </p>
+ * <p>
+ * The third party library distribution provides an API for the GeoIP2 <a
+ * href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web
+ * services</a> and <a
+ * href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. The
+ * API also works with the free <a
+ * href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.
+ * </p>
+ * <p>
+ * Depending on the service level agreement you have with the GeoIP service
+ * provider, the plugin can add a number of the following fields to the index
+ * data model:
  * <ol>
  * <li>Continent</li>
  * <li>Country</li>
@@ -56,51 +62,59 @@ import com.maxmind.geoip2.WebServiceClie
  * <li>Confidence Factors</li>
  * <li>Radius</li>
  * <li>User Type</li>
- * </ol></p>
+ * </ol>
+ * </p>
  * 
- * <p>Some of the services are documented at the 
- * <a href="https://www.maxmind.com/en/geoip2-precision-services">GeoIP2 Precision Services</a>
- * webpage where more information can be obtained.</p>
+ * <p>
+ * Some of the services are documented at the <a
+ * href="https://www.maxmind.com/en/geoip2-precision-services">GeoIP2 Precision
+ * Services</a> webpage where more information can be obtained.
+ * </p>
+ * 
+ * <p>
+ * You should also consult the following three properties in
+ * <code>nutch-site.xml</code>:
+ * </p>
  * 
- * <p>You should also consult the following three properties in <code>nutch-site.xml</code></p>
  * <pre>
- * {@code
- *<!-- index-geoip plugin properties -->
-<property>
-  <name>index.geoip.usage</name>
-  <value>insightsService</value>
-  <description>
-  A string representing the information source to be used for GeoIP information
-  association. Either enter 'cityDatabase', 'connectionTypeDatabase', 
-  'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the 
-  Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, 
-  GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath 
-  and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf
-  </description>
-</property>
-
-<property>
-  <name>index.geoip.userid</name>
-  <value></value>
-  <description>
-  The userId associated with the GeoIP2 Precision Services account.
-  </description>
-</property>
-
-<property>
-  <name>index.geoip.licensekey</name>
-  <value></value>
-  <description>
-  The license key associated with the GeoIP2 Precision Services account.
-  </description>
-</property>
-}
+ *  {@code
+ * <!-- index-geoip plugin properties -->
+ * <property>
+ *   <name>index.geoip.usage</name>
+ *   <value>insightsService</value>
+ *   <description>
+ *   A string representing the information source to be used for GeoIP information
+ *   association. Either enter 'cityDatabase', 'connectionTypeDatabase', 
+ *   'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the 
+ *   Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb, 
+ *   GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath 
+ *   and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf
+ *   </description>
+ * </property>
+ * 
+ * <property>
+ *   <name>index.geoip.userid</name>
+ *   <value></value>
+ *   <description>
+ *   The userId associated with the GeoIP2 Precision Services account.
+ *   </description>
+ * </property>
+ * 
+ * <property>
+ *   <name>index.geoip.licensekey</name>
+ *   <value></value>
+ *   <description>
+ *   The license key associated with the GeoIP2 Precision Services account.
+ *   </description>
+ * </property>
+ * }
  * </pre>
  * 
  */
 public class GeoIPIndexingFilter implements IndexingFilter {
 
-  private static final Logger LOG = LoggerFactory.getLogger(GeoIPIndexingFilter.class);
+  private static final Logger LOG = LoggerFactory
+      .getLogger(GeoIPIndexingFilter.class);
 
   private Configuration conf;
 
@@ -112,7 +126,7 @@ public class GeoIPIndexingFilter impleme
 
   DatabaseReader reader = null;
 
-  //private AbstractResponse response = null;
+  // private AbstractResponse response = null;
 
   /**
    * Default constructor for this plugin
@@ -145,7 +159,8 @@ public class GeoIPIndexingFilter impleme
       }
     } else if (use.equalsIgnoreCase("connectionTypeDatabase")) {
       try {
-        geoDb = new File(conf.getResource("GeoIP2-Connection-Type.mmdb").getFile());
+        geoDb = new File(conf.getResource("GeoIP2-Connection-Type.mmdb")
+            .getFile());
         buildDb();
       } catch (Exception e) {
         LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
@@ -165,8 +180,8 @@ public class GeoIPIndexingFilter impleme
         LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
       }
     } else if (use.equalsIgnoreCase("insightsService")) {
-      client = new WebServiceClient.Builder(
-          conf.getInt("index.geoip.userid", 12345), conf.get("index.geoip.licensekey")).build();
+      client = new WebServiceClient.Builder(conf.getInt("index.geoip.userid",
+          12345), conf.get("index.geoip.licensekey")).build();
     }
     usage = use;
   }
@@ -181,7 +196,9 @@ public class GeoIPIndexingFilter impleme
 
   /**
    * 
-   * @see org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument, org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text, org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks)
+   * @see org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument,
+   *      org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text,
+   *      org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks)
    */
   @Override
   public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
@@ -189,22 +206,28 @@ public class GeoIPIndexingFilter impleme
     return addServerGeo(doc, parse.getData(), url.toString());
   }
 
-  private NutchDocument addServerGeo(NutchDocument doc, ParseData data, String url) {
+  private NutchDocument addServerGeo(NutchDocument doc, ParseData data,
+      String url) {
 
     if (conf.getBoolean("store.ip.address", false) == true) {
       try {
         String serverIp = data.getContentMeta().get("_ip_");
         if (serverIp != null) {
           if (usage.equalsIgnoreCase("cityDatabase")) {
-            doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc, reader);
+            doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc,
+                reader);
           } else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
-            doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc, reader);
+            doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc,
+                reader);
           } else if (usage.equalsIgnoreCase("domainDatabase")) {
-            doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc, reader);
+            doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc,
+                reader);
           } else if (usage.equalsIgnoreCase("ispDatabase")) {
-            doc = GeoIPDocumentCreator.createDocFromIspDb(serverIp, doc, reader);
+            doc = GeoIPDocumentCreator
+                .createDocFromIspDb(serverIp, doc, reader);
           } else if (usage.equalsIgnoreCase("insightsService")) {
-            doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp, doc, client);
+            doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp,
+                doc, client);
           }
         }
       } catch (Exception e) {

Modified: nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java (original)
+++ nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
  * Metadata may come from CrawlDb, parse or content metadata.
  */
 package org.apache.nutch.indexer.metadata;
+