You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/29 06:39:03 UTC
svn commit: r1655526 [13/26] - in /nutch/trunk: ./
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/
src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/p...
Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffix.java Thu Jan 29 05:38:59 2015
@@ -18,29 +18,29 @@
package org.apache.nutch.util.domain;
/**
- * This class represents the last part of the host name,
- * which is operated by authorities, not individuals. This information
- * is needed to find the domain name of a host. The domain name of a host
- * is defined to be the last part before the domain suffix, w/o subdomain
- * names. As an example the domain name of <br><code> http://lucene.apache.org/
- * </code><br> is <code> apache.org</code>
- * <br>
- * This class holds three fields,
- * <strong>domain</strong> field represents the suffix (such as "co.uk")
- * <strong>boost</strong> is a float for boosting score of url's with this suffix
- * <strong>status</strong> field represents domain's status
+ * This class represents the last part of the host name, which is operated by
+ * authorities, not individuals. This information is needed to find the domain
+ * name of a host. The domain name of a host is defined to be the last part
+ * before the domain suffix, w/o subdomain names. As an example the domain name
+ * of <br>
+ * <code> http://lucene.apache.org/
+ * </code><br>
+ * is <code> apache.org</code> <br>
+ * This class holds three fields, <strong>domain</strong> field represents the
+ * suffix (such as "co.uk") <strong>boost</strong> is a float for boosting score
+ * of url's with this suffix <strong>status</strong> field represents domain's
+ * status
*
* @author Enis Soztutar <enis.soz.nutch@gmail.com>
- * @see TopLevelDomain
- * for info please see conf/domain-suffixes.xml
+ * @see TopLevelDomain for info please see conf/domain-suffixes.xml
*/
public class DomainSuffix {
/**
- * Enumeration of the status of the tld. Please see domain-suffixes.xml.
+ * Enumeration of the status of the tld. Please see domain-suffixes.xml.
*/
- public enum Status { INFRASTRUCTURE, SPONSORED, UNSPONSORED
- , STARTUP, PROPOSED, DELETED, PSEUDO_DOMAIN, DEPRECATED, IN_USE, NOT_IN_USE, REJECTED
+ public enum Status {
+ INFRASTRUCTURE, SPONSORED, UNSPONSORED, STARTUP, PROPOSED, DELETED, PSEUDO_DOMAIN, DEPRECATED, IN_USE, NOT_IN_USE, REJECTED
};
private String domain;
@@ -49,7 +49,7 @@ public class DomainSuffix {
public static final float DEFAULT_BOOST = 1.0f;
public static final Status DEFAULT_STATUS = Status.IN_USE;
-
+
public DomainSuffix(String domain, Status status, float boost) {
this.domain = domain;
this.status = status;
@@ -59,7 +59,7 @@ public class DomainSuffix {
public DomainSuffix(String domain) {
this(domain, DEFAULT_STATUS, DEFAULT_BOOST);
}
-
+
public String getDomain() {
return domain;
}
@@ -71,7 +71,7 @@ public class DomainSuffix {
public float getBoost() {
return boost;
}
-
+
@Override
public String toString() {
return domain;
Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixes.java Thu Jan 29 05:38:59 2015
@@ -25,57 +25,62 @@ import org.slf4j.LoggerFactory;
import org.apache.hadoop.util.StringUtils;
/**
- * Storage class for <code>DomainSuffix</code> objects
- * Note: this class is singleton
+ * Storage class for <code>DomainSuffix</code> objects Note: this class is
+ * singleton
+ *
* @author Enis Soztutar <enis.soz.nutch@gmail.com>
*/
public class DomainSuffixes {
- private static final Logger LOG = LoggerFactory.getLogger(DomainSuffixes.class);
-
- private HashMap<String, DomainSuffix> domains = new HashMap<String, DomainSuffix>();
-
+ private static final Logger LOG = LoggerFactory
+ .getLogger(DomainSuffixes.class);
+
+ private HashMap<String, DomainSuffix> domains = new HashMap<String, DomainSuffix>();
+
private static DomainSuffixes instance;
-
+
/** private ctor */
private DomainSuffixes() {
String file = "domain-suffixes.xml";
- InputStream input = this.getClass().getClassLoader().getResourceAsStream(file);
+ InputStream input = this.getClass().getClassLoader()
+ .getResourceAsStream(file);
try {
new DomainSuffixesReader().read(this, input);
- }
- catch (Exception ex) {
+ } catch (Exception ex) {
LOG.warn(StringUtils.stringifyException(ex));
}
}
-
+
/**
* Singleton instance, lazy instantiation
- * @return returns the domain suffix instance
+ *
+ * @return returns the domain suffix instance
*/
public static DomainSuffixes getInstance() {
- if(instance == null) {
+ if (instance == null) {
instance = new DomainSuffixes();
}
return instance;
}
-
+
void addDomainSuffix(DomainSuffix tld) {
domains.put(tld.getDomain(), tld);
}
/** return whether the extension is a registered domain entry */
public boolean isDomainSuffix(String extension) {
- return domains.containsKey(extension);
+ return domains.containsKey(extension);
}
-
+
/**
- * Return the {@link DomainSuffix} object for the extension, if
- * extension is a top level domain returned object will be an
- * instance of {@link TopLevelDomain}
- * @param extension of the domain
+ * Return the {@link DomainSuffix} object for the extension, if extension is a
+ * top level domain returned object will be an instance of
+ * {@link TopLevelDomain}
+ *
+ * @param extension
+ * of the domain
*/
public DomainSuffix get(String extension) {
return domains.get(extension);
}
-
+
}
Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/DomainSuffixesReader.java Thu Jan 29 05:38:59 2015
@@ -36,16 +36,17 @@ import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
- * For parsing xml files containing domain suffix definitions.
- * Parsed xml files should validate against
- * <code>domain-suffixes.xsd</code>
+ * For parsing xml files containing domain suffix definitions. Parsed xml files
+ * should validate against <code>domain-suffixes.xsd</code>
+ *
* @author Enis Soztutar <enis.soz.nutch@gmail.com>
*/
class DomainSuffixesReader {
- private static final Logger LOG = LoggerFactory.getLogger(DomainSuffixesReader.class);
+ private static final Logger LOG = LoggerFactory
+ .getLogger(DomainSuffixesReader.class);
- void read(DomainSuffixes tldEntries, InputStream input) throws IOException{
+ void read(DomainSuffixes tldEntries, InputStream input) throws IOException {
try {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
@@ -54,28 +55,29 @@ class DomainSuffixesReader {
Document document = builder.parse(new InputSource(input));
Element root = document.getDocumentElement();
-
- if(root != null && root.getTagName().equals("domains")) {
-
- Element tlds = (Element)root.getElementsByTagName("tlds").item(0);
- Element suffixes = (Element)root.getElementsByTagName("suffixes").item(0);
-
- //read tlds
- readITLDs(tldEntries, (Element)tlds.getElementsByTagName("itlds").item(0));
- readGTLDs(tldEntries, (Element)tlds.getElementsByTagName("gtlds").item(0));
- readCCTLDs(tldEntries, (Element)tlds.getElementsByTagName("cctlds").item(0));
-
+
+ if (root != null && root.getTagName().equals("domains")) {
+
+ Element tlds = (Element) root.getElementsByTagName("tlds").item(0);
+ Element suffixes = (Element) root.getElementsByTagName("suffixes")
+ .item(0);
+
+ // read tlds
+ readITLDs(tldEntries, (Element) tlds.getElementsByTagName("itlds")
+ .item(0));
+ readGTLDs(tldEntries, (Element) tlds.getElementsByTagName("gtlds")
+ .item(0));
+ readCCTLDs(tldEntries, (Element) tlds.getElementsByTagName("cctlds")
+ .item(0));
+
readSuffixes(tldEntries, suffixes);
- }
- else {
+ } else {
throw new IOException("xml file is not valid");
}
- }
- catch (ParserConfigurationException ex) {
+ } catch (ParserConfigurationException ex) {
LOG.warn(StringUtils.stringifyException(ex));
throw new IOException(ex.getMessage());
- }
- catch (SAXException ex) {
+ } catch (SAXException ex) {
LOG.warn(StringUtils.stringifyException(ex));
throw new IOException(ex.getMessage());
}
@@ -83,22 +85,24 @@ class DomainSuffixesReader {
void readITLDs(DomainSuffixes tldEntries, Element el) {
NodeList children = el.getElementsByTagName("tld");
- for(int i=0;i<children.getLength();i++) {
- tldEntries.addDomainSuffix(readGTLD((Element)children.item(i), Type.INFRASTRUCTURE));
+ for (int i = 0; i < children.getLength(); i++) {
+ tldEntries.addDomainSuffix(readGTLD((Element) children.item(i),
+ Type.INFRASTRUCTURE));
}
}
-
+
void readGTLDs(DomainSuffixes tldEntries, Element el) {
NodeList children = el.getElementsByTagName("tld");
- for(int i=0;i<children.getLength();i++) {
- tldEntries.addDomainSuffix(readGTLD((Element)children.item(i), Type.GENERIC));
+ for (int i = 0; i < children.getLength(); i++) {
+ tldEntries.addDomainSuffix(readGTLD((Element) children.item(i),
+ Type.GENERIC));
}
}
void readCCTLDs(DomainSuffixes tldEntries, Element el) throws IOException {
NodeList children = el.getElementsByTagName("tld");
- for(int i=0;i<children.getLength();i++) {
- tldEntries.addDomainSuffix(readCCTLD((Element)children.item(i)));
+ for (int i = 0; i < children.getLength(); i++) {
+ tldEntries.addDomainSuffix(readCCTLD((Element) children.item(i)));
}
}
@@ -113,39 +117,40 @@ class DomainSuffixesReader {
String domain = el.getAttribute("domain");
Status status = readStatus(el);
float boost = readBoost(el);
- String countryName = readCountryName(el);
- return new TopLevelDomain(domain, status, boost, countryName);
+ String countryName = readCountryName(el);
+ return new TopLevelDomain(domain, status, boost, countryName);
}
-
+
/** read optional field status */
Status readStatus(Element el) {
NodeList list = el.getElementsByTagName("status");
- if(list == null || list.getLength() == 0)
+ if (list == null || list.getLength() == 0)
return DomainSuffix.DEFAULT_STATUS;
return Status.valueOf(list.item(0).getFirstChild().getNodeValue());
}
-
+
/** read optional field boost */
float readBoost(Element el) {
NodeList list = el.getElementsByTagName("boost");
- if(list == null || list.getLength() == 0)
+ if (list == null || list.getLength() == 0)
return DomainSuffix.DEFAULT_BOOST;
return Float.parseFloat(list.item(0).getFirstChild().getNodeValue());
}
-
- /** read field countryname
- */
+
+ /**
+ * read field countryname
+ */
String readCountryName(Element el) throws IOException {
NodeList list = el.getElementsByTagName("country");
- if(list == null || list.getLength() == 0)
+ if (list == null || list.getLength() == 0)
throw new IOException("Country name should be given");
return list.item(0).getNodeValue();
}
-
+
void readSuffixes(DomainSuffixes tldEntries, Element el) {
NodeList children = el.getElementsByTagName("suffix");
- for(int i=0;i<children.getLength();i++) {
- tldEntries.addDomainSuffix(readSuffix((Element)children.item(i)));
+ for (int i = 0; i < children.getLength(); i++) {
+ tldEntries.addDomainSuffix(readSuffix((Element) children.item(i)));
}
}
@@ -155,5 +160,5 @@ class DomainSuffixesReader {
float boost = readBoost(el);
return new DomainSuffix(domain, status, boost);
}
-
+
}
Modified: nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/domain/TopLevelDomain.java Thu Jan 29 05:38:59 2015
@@ -18,44 +18,50 @@
package org.apache.nutch.util.domain;
/**
- * (From wikipedia) A top-level domain (TLD) is the last part of an
- * Internet domain name; that is, the letters which follow the final
- * dot of any domain name. For example, in the domain name
- * <code>www.website.com</code>, the top-level domain is <code>com</code>.
- *
+ * (From wikipedia) A top-level domain (TLD) is the last part of an Internet
+ * domain name; that is, the letters which follow the final dot of any domain
+ * name. For example, in the domain name <code>www.website.com</code>, the
+ * top-level domain is <code>com</code>.
+ *
* @author Enis Soztutar <enis.soz.nutch@gmail.com>
*
* @see <a href="http://www.iana.org/"> iana.org</a>
*
- * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain"> Top-level_domain</a>
+ * @see <a href="http://en.wikipedia.org/wiki/Top-level_domain">
+ * Top-level_domain</a>
*/
public class TopLevelDomain extends DomainSuffix {
- public enum Type { INFRASTRUCTURE, GENERIC, COUNTRY };
-
+ public enum Type {
+ INFRASTRUCTURE, GENERIC, COUNTRY
+ };
+
private Type type;
private String countryName = null;
-
- public TopLevelDomain(String domain, Type type, Status status, float boost){
+
+ public TopLevelDomain(String domain, Type type, Status status, float boost) {
super(domain, status, boost);
this.type = type;
}
- public TopLevelDomain(String domain, Status status, float boost, String countryName){
+ public TopLevelDomain(String domain, Status status, float boost,
+ String countryName) {
super(domain, status, boost);
this.type = Type.COUNTRY;
this.countryName = countryName;
}
-
+
public Type getType() {
return type;
}
- /** Returns the country name if TLD is Country Code TLD
+ /**
+ * Returns the country name if TLD is Country Code TLD
+ *
* @return country name or null
- */
- public String getCountryName(){
+ */
+ public String getCountryName() {
return countryName;
}
-
+
}
Modified: nutch/trunk/src/java/org/apache/nutch/util/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/package-info.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
* Miscellaneous utility classes.
*/
package org.apache.nutch.util;
+
Modified: nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -41,16 +41,17 @@ import java.net.MalformedURLException;
/** Adds basic searchable fields to a document. */
public class CCIndexingFilter implements IndexingFilter {
- public static final Logger LOG = LoggerFactory.getLogger(CCIndexingFilter.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(CCIndexingFilter.class);
/** The name of the document field we use. */
public static String FIELD = "cc";
private Configuration conf;
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
- throws IndexingException {
-
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
Metadata metadata = parse.getData().getParseMeta();
// index the license
String licenseUrl = metadata.get(CreativeCommons.LICENSE_URL);
@@ -81,9 +82,11 @@ public class CCIndexingFilter implements
return doc;
}
- /** Add the features represented by a license URL. Urls are of the form
+ /**
+ * Add the features represented by a license URL. Urls are of the form
* "http://creativecommons.org/licenses/xx-xx/xx/xx", where "xx" names a
- * license feature. */
+ * license feature.
+ */
public void addUrlFeatures(NutchDocument doc, String urlString) {
try {
URL url = new URL(urlString);
@@ -92,7 +95,7 @@ public class CCIndexingFilter implements
StringTokenizer names = new StringTokenizer(url.getPath(), "/-");
if (names.hasMoreTokens())
- names.nextToken(); // throw away "licenses"
+ names.nextToken(); // throw away "licenses"
// add a feature per component after "licenses"
while (names.hasMoreTokens()) {
@@ -105,7 +108,7 @@ public class CCIndexingFilter implements
}
}
}
-
+
private void addFeature(NutchDocument doc, String feature) {
doc.add(FIELD, feature);
}
Modified: nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)
+++ nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Thu Jan 29 05:38:59 2015
@@ -33,27 +33,25 @@ import javax.xml.parsers.*;
import org.xml.sax.InputSource;
import org.w3c.dom.*;
-
/** Adds metadata identifying the Creative Commons license used, if any. */
public class CCParseFilter implements HtmlParseFilter {
public static final Logger LOG = LoggerFactory.getLogger(CCParseFilter.class);
-
- /** Walks DOM tree, looking for RDF in comments and licenses in anchors.*/
+ /** Walks DOM tree, looking for RDF in comments and licenses in anchors. */
public static class Walker {
- private URL base; // base url of page
- private String rdfLicense; // subject url found, if any
- private URL relLicense; // license url found, if any
- private URL anchorLicense; // anchor url found, if any
- private String workType; // work type URI
+ private URL base; // base url of page
+ private String rdfLicense; // subject url found, if any
+ private URL relLicense; // license url found, if any
+ private URL anchorLicense; // anchor url found, if any
+ private String workType; // work type URI
private Walker(URL base) {
this.base = base;
}
- /** Scan the document adding attributes to metadata.*/
- public static void walk(Node doc, URL base, Metadata metadata, Configuration conf)
- throws ParseException {
+ /** Scan the document adding attributes to metadata. */
+ public static void walk(Node doc, URL base, Metadata metadata,
+ Configuration conf) throws ParseException {
// walk the DOM tree, scanning for license data
Walker walker = new Walker(base);
@@ -62,13 +60,13 @@ public class CCParseFilter implements Ht
// interpret results of walk
String licenseUrl = null;
String licenseLocation = null;
- if (walker.rdfLicense != null) { // 1st choice: subject in RDF
+ if (walker.rdfLicense != null) { // 1st choice: subject in RDF
licenseLocation = "rdf";
licenseUrl = walker.rdfLicense;
- } else if (walker.relLicense != null) { // 2nd: anchor w/ rel=license
+ } else if (walker.relLicense != null) { // 2nd: anchor w/ rel=license
licenseLocation = "rel";
licenseUrl = walker.relLicense.toString();
- } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC license
+ } else if (walker.anchorLicense != null) { // 3rd: anchor w/ CC license
licenseLocation = "a";
licenseUrl = walker.anchorLicense.toString();
} else if (conf.getBoolean("creativecommons.exclude.unlicensed", false)) {
@@ -78,7 +76,8 @@ public class CCParseFilter implements Ht
// add license to metadata
if (licenseUrl != null) {
if (LOG.isInfoEnabled()) {
- LOG.info("CC: found "+licenseUrl+" in "+licenseLocation+" of "+base);
+ LOG.info("CC: found " + licenseUrl + " in " + licenseLocation
+ + " of " + base);
}
metadata.add(CreativeCommons.LICENSE_URL, licenseUrl);
metadata.add(CreativeCommons.LICENSE_LOCATION, licenseLocation);
@@ -86,36 +85,38 @@ public class CCParseFilter implements Ht
if (walker.workType != null) {
if (LOG.isInfoEnabled()) {
- LOG.info("CC: found "+walker.workType+" in "+base);
+ LOG.info("CC: found " + walker.workType + " in " + base);
}
metadata.add(CreativeCommons.WORK_TYPE, walker.workType);
}
}
- /** Scan the document looking for RDF in comments and license elements.*/
+ /** Scan the document looking for RDF in comments and license elements. */
private void walk(Node node) {
-
+
// check element nodes for license URL
if (node instanceof Element) {
- findLicenseUrl((Element)node);
+ findLicenseUrl((Element) node);
}
// check comment nodes for license RDF
if (node instanceof Comment) {
- findRdf(((Comment)node).getData());
+ findRdf(((Comment) node).getData());
}
// recursively walk child nodes
NodeList children = node.getChildNodes();
- for (int i = 0; children != null && i < children.getLength(); i++ ) {
+ for (int i = 0; children != null && i < children.getLength(); i++) {
walk(children.item(i));
}
}
- /** Extract license url from element, if any. These are the href attribute
- * of anchor elements with rel="license". These must also point to
- * http://creativecommons.org/licenses/. */
+ /**
+ * Extract license url from element, if any. These are the href attribute of
+ * anchor elements with rel="license". These must also point to
+ * http://creativecommons.org/licenses/.
+ */
private void findLicenseUrl(Element element) {
// only look in Anchor elements
if (!"a".equalsIgnoreCase(element.getTagName()))
@@ -125,54 +126,52 @@ public class CCParseFilter implements Ht
String href = element.getAttribute("href");
if (href == null)
return;
-
+
try {
- URL url = new URL(base, href); // resolve the url
+ URL url = new URL(base, href); // resolve the url
// check that it's a CC license URL
- if ("http".equalsIgnoreCase(url.getProtocol()) &&
- "creativecommons.org".equalsIgnoreCase(url.getHost()) &&
- url.getPath() != null &&
- url.getPath().startsWith("/licenses/") &&
- url.getPath().length() > "/licenses/".length()) {
+ if ("http".equalsIgnoreCase(url.getProtocol())
+ && "creativecommons.org".equalsIgnoreCase(url.getHost())
+ && url.getPath() != null && url.getPath().startsWith("/licenses/")
+ && url.getPath().length() > "/licenses/".length()) {
// check rel="license"
String rel = element.getAttribute("rel");
if (rel != null && "license".equals(rel) && this.relLicense == null) {
- this.relLicense = url; // found rel license
+ this.relLicense = url; // found rel license
} else if (this.anchorLicense == null) {
- this.anchorLicense = url; // found anchor license
+ this.anchorLicense = url; // found anchor license
}
}
- } catch (MalformedURLException e) { // ignore malformed urls
+ } catch (MalformedURLException e) { // ignore malformed urls
}
}
- /** Configure a namespace aware XML parser. */
- private static final DocumentBuilderFactory FACTORY
- = DocumentBuilderFactory.newInstance();
+ /** Configure a namespace aware XML parser. */
+ private static final DocumentBuilderFactory FACTORY = DocumentBuilderFactory
+ .newInstance();
static {
FACTORY.setNamespaceAware(true);
}
/** Creative Commons' namespace URI. */
private static final String CC_NS = "http://web.resource.org/cc/";
-
+
/** Dublin Core namespace URI. */
private static final String DC_NS = "http://purl.org/dc/elements/1.1/";
-
+
/** RDF syntax namespace URI. */
- private static final String RDF_NS
- = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
+ private static final String RDF_NS = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
private void findRdf(String comment) {
// first check for likely RDF in comment
int rdfPosition = comment.indexOf("RDF");
if (rdfPosition < 0)
- return; // no RDF, abort
+ return; // no RDF, abort
int nsPosition = comment.indexOf(CC_NS);
if (nsPosition < 0)
- return; // no RDF, abort
+ return; // no RDF, abort
// try to parse the XML
Document doc;
@@ -181,28 +180,30 @@ public class CCParseFilter implements Ht
doc = parser.parse(new InputSource(new StringReader(comment)));
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
- LOG.warn("CC: Failed to parse RDF in "+base+": "+e);
+ LOG.warn("CC: Failed to parse RDF in " + base + ": " + e);
}
- //e.printStackTrace();
+ // e.printStackTrace();
return;
}
// check that root is rdf:RDF
NodeList roots = doc.getElementsByTagNameNS(RDF_NS, "RDF");
if (roots.getLength() != 1) {
- if (LOG.isWarnEnabled()) { LOG.warn("CC: No RDF root in "+base); }
+ if (LOG.isWarnEnabled()) {
+ LOG.warn("CC: No RDF root in " + base);
+ }
return;
}
- Element rdf = (Element)roots.item(0);
+ Element rdf = (Element) roots.item(0);
// get cc:License nodes inside rdf:RDF
NodeList licenses = rdf.getElementsByTagNameNS(CC_NS, "License");
for (int i = 0; i < licenses.getLength(); i++) {
- Element l = (Element)licenses.item(i);
+ Element l = (Element) licenses.item(i);
// license is rdf:about= attribute from cc:License
- this.rdfLicense = l.getAttributeNodeNS(RDF_NS,"about").getValue();
+ this.rdfLicense = l.getAttributeNodeNS(RDF_NS, "about").getValue();
// walk predicates of cc:License
NodeList predicates = l.getChildNodes();
@@ -210,17 +211,17 @@ public class CCParseFilter implements Ht
Node predicateNode = predicates.item(j);
if (!(predicateNode instanceof Element))
continue;
- Element predicateElement = (Element)predicateNode;
+ Element predicateElement = (Element) predicateNode;
// extract predicates of cc:xxx predicates
if (!CC_NS.equals(predicateElement.getNamespaceURI())) {
continue;
}
-
+
// add object and predicate to metadata
// metadata.put(object, predicate);
// if (LOG.isInfoEnabled()) {
- // LOG.info("CC: found: "+predicate+"="+object);
+ // LOG.info("CC: found: "+predicate+"="+object);
// }
}
}
@@ -230,10 +231,11 @@ public class CCParseFilter implements Ht
for (int i = 0; i < works.getLength(); i++) {
// get dc:type nodes from cc:Work
NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");
-
+
for (int j = 0; j < types.getLength(); j++) {
- Element type = (Element)types.item(j);
- String workUri = type.getAttributeNodeNS(RDF_NS, "resource").getValue();
+ Element type = (Element) types.item(j);
+ String workUri = type.getAttributeNodeNS(RDF_NS, "resource")
+ .getValue();
this.workType = WORK_TYPE_NAMES.get(workUri);
}
}
@@ -246,16 +248,20 @@ public class CCParseFilter implements Ht
WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");
WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Sound", "audio");
WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Text", "text");
- WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive", "interactive");
+ WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Interactive",
+ "interactive");
WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Software", "software");
WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/Image", "image");
}
private Configuration conf;
- /** Adds metadata or otherwise modifies a parse of an HTML document, given
- * the DOM tree of a page. */
- public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+ /**
+ * Adds metadata or otherwise modifies a parse of an HTML document, given the
+ * DOM tree of a page.
+ */
+ public ParseResult filter(Content content, ParseResult parseResult,
+ HTMLMetaTags metaTags, DocumentFragment doc) {
// get parse obj
Parse parse = parseResult.get(content.getUrl());
@@ -266,9 +272,8 @@ public class CCParseFilter implements Ht
base = new URL(content.getBaseUrl());
} catch (MalformedURLException e) {
Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
- parseResult.put(content.getUrl(),
- new ParseText(emptyParse.getText()),
- emptyParse.getData());
+ parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()),
+ emptyParse.getData());
return parseResult;
}
@@ -277,9 +282,8 @@ public class CCParseFilter implements Ht
Walker.walk(doc, base, parse.getData().getParseMeta(), getConf());
} catch (ParseException e) {
Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
- parseResult.put(content.getUrl(),
- new ParseText(emptyParse.getText()),
- emptyParse.getData());
+ parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()),
+ emptyParse.getData());
return parseResult;
}
Modified: nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)
+++ nutch/trunk/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Thu Jan 29 05:38:59 2015
@@ -30,30 +30,28 @@ import java.io.*;
public class TestCCParseFilter {
- private static final File testDir =
- new File(System.getProperty("test.input"));
+ private static final File testDir = new File(System.getProperty("test.input"));
@Test
public void testPages() throws Exception {
pageTest(new File(testDir, "anchor.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
+ "http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);
// Tika returns <a> whereas parse-html returns <rel>
// check later
pageTest(new File(testDir, "rel.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
+ "http://creativecommons.org/licenses/by-nc/2.0", "rel", null);
// Tika returns <a> whereas parse-html returns <rdf>
// check later
pageTest(new File(testDir, "rdf.html"), "http://foo.com/",
- "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
+ "http://creativecommons.org/licenses/by-nc/1.0", "rdf", "text");
}
- public void pageTest(File file, String url,
- String license, String location, String type)
- throws Exception {
+ public void pageTest(File file, String url, String license, String location,
+ String type) throws Exception {
String contentType = "text/html";
InputStream in = new FileInputStream(file);
- ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
+ ByteArrayOutputStream out = new ByteArrayOutputStream((int) file.length());
byte[] buffer = new byte[1024];
int i;
while ((i = in.read(buffer)) != -1) {
@@ -63,14 +61,13 @@ public class TestCCParseFilter {
byte[] bytes = out.toByteArray();
Configuration conf = NutchConfiguration.create();
- Content content =
- new Content(url, url, bytes, contentType, new Metadata(), conf);
- Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
-
+ Content content = new Content(url, url, bytes, contentType, new Metadata(),
+ conf);
+ Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+
Metadata metadata = parse.getData().getParseMeta();
Assert.assertEquals(license, metadata.get("License-Url"));
Assert.assertEquals(location, metadata.get("License-Location"));
Assert.assertEquals(type, metadata.get("Work-Type"));
}
}
-
Modified: nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -38,78 +38,77 @@ import org.apache.nutch.parse.ParseData;
* @author mattmann
* @since NUTCH-444
*
- * An {@link IndexingFilter} implementation to pull out the
- * relevant extracted {@link Metadata} fields from the RSS feeds
- * and into the index.
- *
+ * An {@link IndexingFilter} implementation to pull out the relevant
+ * extracted {@link Metadata} fields from the RSS feeds and into the
+ * index.
+ *
*/
public class FeedIndexingFilter implements IndexingFilter {
-
+
public static final String dateFormatStr = "yyyyMMddHHmm";
-
+
private Configuration conf;
-
+
private final static String PUBLISHED_DATE = "publishedDate";
-
+
private final static String UPDATED_DATE = "updatedDate";
-
+
/**
* Extracts out the relevant fields:
*
* <ul>
- * <li>FEED_AUTHOR</li>
- * <li>FEED_TAGS</li>
- * <li>FEED_PUBLISHED</li>
- * <li>FEED_UPDATED</li>
- * <li>FEED</li>
+ * <li>FEED_AUTHOR</li>
+ * <li>FEED_TAGS</li>
+ * <li>FEED_PUBLISHED</li>
+ * <li>FEED_UPDATED</li>
+ * <li>FEED</li>
* </ul>
*
- * And sends them to the {@link Indexer} for indexing within the Nutch
- * index.
- *
+ * And sends them to the {@link Indexer} for indexing within the Nutch index.
+ *
*/
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
- Inlinks inlinks) throws IndexingException {
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
ParseData parseData = parse.getData();
Metadata parseMeta = parseData.getParseMeta();
-
+
String[] authors = parseMeta.getValues(Feed.FEED_AUTHOR);
String[] tags = parseMeta.getValues(Feed.FEED_TAGS);
String published = parseMeta.get(Feed.FEED_PUBLISHED);
String updated = parseMeta.get(Feed.FEED_UPDATED);
String feed = parseMeta.get(Feed.FEED);
-
+
if (authors != null) {
for (String author : authors) {
doc.add(Feed.FEED_AUTHOR, author);
}
}
-
+
if (tags != null) {
for (String tag : tags) {
doc.add(Feed.FEED_TAGS, tag);
}
}
-
+
if (feed != null)
doc.add(Feed.FEED, feed);
-
+
if (published != null) {
Date date = new Date(Long.parseLong(published));
doc.add(PUBLISHED_DATE, date);
}
-
+
if (updated != null) {
Date date = new Date(Long.parseLong(updated));
doc.add(UPDATED_DATE, date);
}
-
+
return doc;
}
/**
- * @return the {@link Configuration} object used to configure
- * this {@link IndexingFilter}.
+ * @return the {@link Configuration} object used to configure this
+ * {@link IndexingFilter}.
*/
public Configuration getConf() {
return conf;
@@ -119,8 +118,9 @@ public class FeedIndexingFilter implemen
* Sets the {@link Configuration} object used to configure this
* {@link IndexingFilter}.
*
- * @param conf The {@link Configuration} object used to configure
- * this {@link IndexingFilter}.
+ * @param conf
+ * The {@link Configuration} object used to configure this
+ * {@link IndexingFilter}.
*/
public void setConf(Configuration conf) {
this.conf = conf;
Modified: nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java (original)
+++ nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
* Indexing filter to index meta data from RSS feeds.
*/
package org.apache.nutch.indexer.feed;
+
Modified: nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java (original)
+++ nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java Thu Jan 29 05:38:59 2015
@@ -66,10 +66,10 @@ import com.sun.syndication.io.SyndFeedIn
* @author mattmann
* @since NUTCH-444
*
- * <p>
- * A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced links
- * and content present in the feed.
- * </p>
+ * <p>
+ * A new RSS/ATOM Feed{@link Parser} that rapidly parses all referenced
+ * links and content present in the feed.
+ * </p>
*
*/
public class FeedParser implements Parser {
@@ -99,8 +99,8 @@ public class FeedParser implements Parse
* A {@link Content} object representing the feed that is being
* parsed by this {@link Parser}.
*
- * @return A {@link ParseResult} containing all {@link Parse}d feeds that
- * were present in the feed file that this {@link Parser} dealt with.
+ * @return A {@link ParseResult} containing all {@link Parse}d feeds that were
+ * present in the feed file that this {@link Parser} dealt with.
*
*/
public ParseResult getParse(Content content) {
@@ -111,8 +111,8 @@ public class FeedParser implements Parse
detector.autoDetectClues(content, true);
String encoding = detector.guessEncoding(content, defaultEncoding);
try {
- InputSource input = new InputSource(new ByteArrayInputStream(content
- .getContent()));
+ InputSource input = new InputSource(new ByteArrayInputStream(
+ content.getContent()));
input.setEncoding(encoding);
SyndFeedInput feedInput = new SyndFeedInput();
feed = feedInput.build(input);
@@ -134,8 +134,8 @@ public class FeedParser implements Parse
}
List<?> entries = feed.getEntries();
- for(Object entry: entries) {
- addToMap(parseResult, feed, feedLink, (SyndEntry)entry, content);
+ for (Object entry : entries) {
+ addToMap(parseResult, feed, feedLink, (SyndEntry) entry, content);
}
String feedDesc = stripTags(feed.getDescriptionEx());
@@ -170,8 +170,8 @@ public class FeedParser implements Parse
this.parserFactory = new ParserFactory(conf);
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
this.filters = new URLFilters(conf);
- this.defaultEncoding =
- conf.get("parser.character.encoding.default", "windows-1252");
+ this.defaultEncoding = conf.get("parser.character.encoding.default",
+ "windows-1252");
}
/**
@@ -255,8 +255,8 @@ public class FeedParser implements Parse
if (text == null) {
List<?> contents = entry.getContents();
StringBuilder buf = new StringBuilder();
- for (Object syndContent: contents) {
- buf.append(((SyndContent)syndContent).getValue());
+ for (Object syndContent : contents) {
+ buf.append(((SyndContent) syndContent).getValue());
}
text = buf.toString();
}
@@ -273,9 +273,9 @@ public class FeedParser implements Parse
ParseData data = parse.getData();
data.getContentMeta().remove(Response.CONTENT_TYPE);
mergeMetadata(data.getParseMeta(), parseMeta);
- parseResult.put(link, new ParseText(parse.getText()), new ParseData(
- ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data
- .getContentMeta(), data.getParseMeta()));
+ parseResult.put(link, new ParseText(parse.getText()),
+ new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(),
+ data.getContentMeta(), data.getParseMeta()));
} else {
contentMeta.remove(Response.CONTENT_TYPE);
parseResult.put(link, new ParseText(text), new ParseData(
@@ -323,7 +323,7 @@ public class FeedParser implements Parse
}
}
- for (Object i: categories) {
+ for (Object i : categories) {
parseMeta.add(Feed.FEED_TAGS, ((SyndCategory) i).getName());
}
Modified: nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java (original)
+++ nutch/trunk/src/plugin/feed/src/java/org/apache/nutch/parse/feed/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
* Parse RSS feeds.
*/
package org.apache.nutch.parse.feed;
+
Modified: nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java (original)
+++ nutch/trunk/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java Thu Jan 29 05:38:59 2015
@@ -43,7 +43,7 @@ import org.apache.nutch.util.NutchConfig
*
* @author mattmann
*
- * Test Suite for the {@link FeedParser}.
+ * Test Suite for the {@link FeedParser}.
*
*/
public class TestFeedParser {
@@ -96,18 +96,17 @@ public class TestFeedParser {
Assert.assertEquals(3, parseResult.size());
- boolean hasLink1 = false, hasLink2 = false, hasLink3=false;
+ boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
for (Iterator<Map.Entry<Text, Parse>> j = parseResult.iterator(); j
.hasNext();) {
Map.Entry<Text, Parse> entry = j.next();
- if (entry.getKey().toString().equals(
- "http://www-scf.usc.edu/~mattmann/")) {
+ if (entry.getKey().toString()
+ .equals("http://www-scf.usc.edu/~mattmann/")) {
hasLink1 = true;
} else if (entry.getKey().toString().equals("http://www.nutch.org/")) {
hasLink2 = true;
- }
- else if(entry.getKey().toString().equals(urlString)){
+ } else if (entry.getKey().toString().equals(urlString)) {
hasLink3 = true;
}
Modified: nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java (original)
+++ nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/HeadingsParseFilter.java Thu Jan 29 05:38:59 2015
@@ -38,15 +38,16 @@ public class HeadingsParseFilter impleme
* Pattern used to strip surpluss whitespace
*/
protected static Pattern whitespacePattern = Pattern.compile("\\s+");
-
+
private Configuration conf;
private String[] headings;
private boolean multiValued = false;
- public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+ public ParseResult filter(Content content, ParseResult parseResult,
+ HTMLMetaTags metaTags, DocumentFragment doc) {
Parse parse = parseResult.get(content.getUrl());
- for (int i = 0 ; headings != null && i < headings.length ; i++ ) {
+ for (int i = 0; headings != null && i < headings.length; i++) {
List<String> discoveredHeadings = getElement(doc, headings[i]);
if (discoveredHeadings.size() > 0) {
@@ -89,7 +90,7 @@ public class HeadingsParseFilter impleme
if (currentNode.getNodeType() == Node.ELEMENT_NODE) {
if (element.equalsIgnoreCase(currentNode.getNodeName())) {
headings.add(getNodeValue(currentNode));
-
+
// Check for multiValued here, if disabled we don't need
// to discover more headings.
if (!multiValued) {
Modified: nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java (original)
+++ nutch/trunk/src/plugin/headings/src/java/org/apache/nutch/parse/headings/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
* Parse filter to extract headings (h1, h2, etc.) from DOM parse tree.
*/
package org.apache.nutch.parse.headings;
+
Modified: nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -30,13 +30,15 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * Indexing filter that offers an option to either index all inbound anchor text for
- * a document or deduplicate anchors. Deduplication does have it's con's,
+ * Indexing filter that offers an option to either index all inbound anchor text
+ * for a document or deduplicate anchors. Deduplication does have it's con's,
+ *
* @see {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
*/
public class AnchorIndexingFilter implements IndexingFilter {
- public static final Logger LOG = LoggerFactory.getLogger(AnchorIndexingFilter.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(AnchorIndexingFilter.class);
private Configuration conf;
private boolean deduplicate = false;
@@ -49,6 +51,7 @@ public class AnchorIndexingFilter implem
deduplicate = conf.getBoolean("anchorIndexingFilter.deduplicate", false);
LOG.info("Anchor deduplication is: " + (deduplicate ? "on" : "off"));
}
+
/**
* Get the {@link Configuration} object
*/
@@ -57,28 +60,33 @@ public class AnchorIndexingFilter implem
}
/**
- * The {@link AnchorIndexingFilter} filter object which supports boolean
- * configuration settings for the deduplication of anchors.
- * See {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
- *
- * @param doc The {@link NutchDocument} object
- * @param parse The relevant {@link Parse} object passing through the filter
- * @param url URL to be filtered for anchor text
- * @param datum The {@link CrawlDatum} entry
- * @param inlinks The {@link Inlinks} containing anchor text
+ * The {@link AnchorIndexingFilter} filter object which supports boolean
+ * configuration settings for the deduplication of anchors. See
+ * {@code anchorIndexingFilter.deduplicate} in nutch-default.xml.
+ *
+ * @param doc
+ * The {@link NutchDocument} object
+ * @param parse
+ * The relevant {@link Parse} object passing through the filter
+ * @param url
+ * URL to be filtered for anchor text
+ * @param datum
+ * The {@link CrawlDatum} entry
+ * @param inlinks
+ * The {@link Inlinks} containing anchor text
* @return filtered NutchDocument
*/
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum,
- Inlinks inlinks) throws IndexingException {
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
- String[] anchors = (inlinks != null ? inlinks.getAnchors()
- : new String[0]);
+ String[] anchors = (inlinks != null ? inlinks.getAnchors() : new String[0]);
HashSet<String> set = null;
for (int i = 0; i < anchors.length; i++) {
if (deduplicate) {
- if (set == null) set = new HashSet<String>();
+ if (set == null)
+ set = new HashSet<String>();
String lcAnchor = anchors[i].toLowerCase();
// Check if already processed the current anchor
Modified: nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -29,12 +29,11 @@ import org.junit.Assert;
import org.junit.Test;
/**
- * JUnit test case which tests
- * 1. that anchor text is obtained
- * 2. that anchor deduplication functionality is working
+ * JUnit test case which tests 1. that anchor text is obtained 2. that anchor
+ * deduplication functionality is working
*
* @author lewismc
- *
+ *
*/
public class TestAnchorIndexingFilter {
@@ -52,14 +51,17 @@ public class TestAnchorIndexingFilter {
inlinks.add(new Inlink("http://test2.com/", "text2"));
inlinks.add(new Inlink("http://test3.com/", "text2"));
try {
- filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), inlinks);
- } catch(Exception e){
+ filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+ new CrawlDatum(), inlinks);
+ } catch (Exception e) {
e.printStackTrace();
Assert.fail(e.getMessage());
}
Assert.assertNotNull(doc);
- Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
- Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor").getValues().size());
+ Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames()
+ .contains("anchor"));
+ Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor")
+ .getValues().size());
}
}
Modified: nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -39,42 +39,48 @@ import java.util.Date;
import org.apache.hadoop.conf.Configuration;
-/**
- * Adds basic searchable fields to a document.
- * The fields added are : domain, host, url, content, title, cache, tstamp
- * domain is included depending on {@code indexer.add.domain} in nutch-default.xml.
- * title is truncated as per {@code indexer.max.title.length} in nutch-default.xml.
- * (As per NUTCH-1004, a zero-length title is not added)
- * content is truncated as per {@code indexer.max.content.length} in nutch-default.xml.
+/**
+ * Adds basic searchable fields to a document. The fields added are : domain,
+ * host, url, content, title, cache, tstamp domain is included depending on
+ * {@code indexer.add.domain} in nutch-default.xml. title is truncated as per
+ * {@code indexer.max.title.length} in nutch-default.xml. (As per NUTCH-1004, a
+ * zero-length title is not added) content is truncated as per
+ * {@code indexer.max.content.length} in nutch-default.xml.
*/
public class BasicIndexingFilter implements IndexingFilter {
- public static final Logger LOG = LoggerFactory.getLogger(BasicIndexingFilter.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(BasicIndexingFilter.class);
private int MAX_TITLE_LENGTH;
private int MAX_CONTENT_LENGTH;
private boolean addDomain = false;
private Configuration conf;
- /**
- * The {@link BasicIndexingFilter} filter object which supports few
- * configuration settings for adding basic searchable fields.
- * See {@code indexer.add.domain}, {@code indexer.max.title.length},
- * {@code indexer.max.content.length} in nutch-default.xml.
- *
- * @param doc The {@link NutchDocument} object
- * @param parse The relevant {@link Parse} object passing through the filter
- * @param url URL to be filtered for anchor text
- * @param datum The {@link CrawlDatum} entry
- * @param inlinks The {@link Inlinks} containing anchor text
- * @return filtered NutchDocument
- */
- public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
- throws IndexingException {
+ /**
+ * The {@link BasicIndexingFilter} filter object which supports few
+ * configuration settings for adding basic searchable fields. See
+ * {@code indexer.add.domain}, {@code indexer.max.title.length},
+ * {@code indexer.max.content.length} in nutch-default.xml.
+ *
+ * @param doc
+ * The {@link NutchDocument} object
+ * @param parse
+ * The relevant {@link Parse} object passing through the filter
+ * @param url
+ * URL to be filtered for anchor text
+ * @param datum
+ * The {@link CrawlDatum} entry
+ * @param inlinks
+ * The {@link Inlinks} containing anchor text
+ * @return filtered NutchDocument
+ */
+ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+ CrawlDatum datum, Inlinks inlinks) throws IndexingException {
Text reprUrl = (Text) datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
String reprUrlString = reprUrl != null ? reprUrl.toString() : null;
String urlString = url.toString();
-
+
String host = null;
try {
URL u;
@@ -83,11 +89,11 @@ public class BasicIndexingFilter impleme
} else {
u = new URL(urlString);
}
-
+
if (addDomain) {
doc.add("domain", URLUtil.getDomainName(u));
}
-
+
host = u.getHost();
} catch (MalformedURLException e) {
throw new IndexingException(e);
@@ -108,7 +114,10 @@ public class BasicIndexingFilter impleme
// title
String title = parse.getData().getTitle();
- if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate title if needed
+ if (MAX_TITLE_LENGTH > -1 && title.length() > MAX_TITLE_LENGTH) { // truncate
+ // title
+ // if
+ // needed
title = title.substring(0, MAX_TITLE_LENGTH);
}
Modified: nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -34,20 +34,20 @@ import org.junit.Test;
import java.util.Date;
/**
- * JUnit test case which tests
- * 1. that basic searchable fields are added to a document
- * 2. that domain is added as per {@code indexer.add.domain} in nutch-default.xml.
- * 3. that title is truncated as per {@code indexer.max.title.length} in nutch-default.xml.
- * 4. that content is truncated as per {@code indexer.max.content.length} in nutch-default.xml.
+ * JUnit test case which tests 1. that basic searchable fields are added to a
+ * document 2. that domain is added as per {@code indexer.add.domain} in
+ * nutch-default.xml. 3. that title is truncated as per
+ * {@code indexer.max.title.length} in nutch-default.xml. 4. that content is
+ * truncated as per {@code indexer.max.content.length} in nutch-default.xml.
*
* @author tejasp
- *
+ *
*/
public class TestBasicIndexingFilter {
@Test
- public void testBasicIndexingFilter() throws Exception {
+ public void testBasicIndexingFilter() throws Exception {
Configuration conf = NutchConfiguration.create();
conf.setInt("indexer.max.title.length", 10);
conf.setBoolean("indexer.add.domain", true);
@@ -63,8 +63,10 @@ public class TestBasicIndexingFilter {
Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
Metadata metaData = new Metadata();
metaData.add("Language", "en/us");
- ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
- ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);
+ ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
+ outlinks, metaData);
+ ParseImpl parse = new ParseImpl(
+ "this is a sample foo bar page. hope you enjoy it.", parseData);
CrawlDatum crawlDatum = new CrawlDatum();
crawlDatum.setFetchTime(100L);
@@ -72,18 +74,26 @@ public class TestBasicIndexingFilter {
Inlinks inlinks = new Inlinks();
try {
- filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
- } catch(Exception e){
+ filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
+ crawlDatum, inlinks);
+ } catch (Exception e) {
e.printStackTrace();
Assert.fail(e.getMessage());
}
Assert.assertNotNull(doc);
- Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc.getField("title").getValues().get(0));
- Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc.getField("domain").getValues().get(0));
- Assert.assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", doc.getField("host").getValues().get(0));
- Assert.assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html",
- doc.getField("url").getValues().get(0));
- Assert.assertEquals("test content", "this is a sample foo", doc.getField("content").getValues().get(0));
- Assert.assertEquals("test fetch time", new Date(100L), (Date)doc.getField("tstamp").getValues().get(0));
+ Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc
+ .getField("title").getValues().get(0));
+ Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc
+ .getField("domain").getValues().get(0));
+ Assert.assertEquals("test host, expect \"nutch.apache.org\"",
+ "nutch.apache.org", doc.getField("host").getValues().get(0));
+ Assert.assertEquals(
+ "test url, expect \"http://nutch.apache.org/index.html\"",
+ "http://nutch.apache.org/index.html", doc.getField("url").getValues()
+ .get(0));
+ Assert.assertEquals("test content", "this is a sample foo",
+ doc.getField("content").getValues().get(0));
+ Assert.assertEquals("test fetch time", new Date(100L),
+ (Date) doc.getField("tstamp").getValues().get(0));
}
}
Modified: nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java (original)
+++ nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPDocumentCreator.java Thu Jan 29 05:38:59 2015
@@ -41,13 +41,16 @@ import com.maxmind.geoip2.record.Subdivi
import com.maxmind.geoip2.record.Traits;
/**
- * <p>Simple utility class which enables efficient, structured
- * {@link org.apache.nutch.indexer.NutchDocument} building based on input
- * from {@link GeoIPIndexingFilter}, where configuration is also read.</p>
- * <p>Based on the nature of the input, this class wraps factory type
- * implementations for populating {@link org.apache.nutch.indexer.NutchDocument}'s
- * with the correct {@link org.apache.nutch.indexer.NutchField} information.
- *
+ * <p>
+ * Simple utility class which enables efficient, structured
+ * {@link org.apache.nutch.indexer.NutchDocument} building based on input from
+ * {@link GeoIPIndexingFilter}, where configuration is also read.
+ * </p>
+ * <p>
+ * Based on the nature of the input, this class wraps factory type
+ * implementations for populating {@link org.apache.nutch.indexer.NutchDocument}
+ * 's with the correct {@link org.apache.nutch.indexer.NutchField} information.
+ *
*/
public class GeoIPDocumentCreator {
@@ -58,13 +61,15 @@ public class GeoIPDocumentCreator {
}
public static NutchDocument createDocFromInsightsService(String serverIp,
- NutchDocument doc, WebServiceClient client) throws UnknownHostException, IOException, GeoIp2Exception {
+ NutchDocument doc, WebServiceClient client) throws UnknownHostException,
+ IOException, GeoIp2Exception {
doc.add("ip", serverIp);
- InsightsResponse response = client.insights(InetAddress.getByName(serverIp));
- //CityResponse response = client.city(InetAddress.getByName(serverIp));
-
+ InsightsResponse response = client
+ .insights(InetAddress.getByName(serverIp));
+ // CityResponse response = client.city(InetAddress.getByName(serverIp));
+
City city = response.getCity();
- doc.add("cityName", city.getName()); // 'Minneapolis'
+ doc.add("cityName", city.getName()); // 'Minneapolis'
doc.add("cityConfidence", city.getConfidence()); // 50
doc.add("cityGeoNameId", city.getGeoNameId());
@@ -74,31 +79,32 @@ public class GeoIPDocumentCreator {
doc.add("continentName", continent.getName());
Country country = response.getCountry();
- doc.add("countryIsoCode", country.getIsoCode()); // 'US'
- doc.add("countryName", country.getName()); // 'United States'
- doc.add("countryConfidence", country.getConfidence()); // 99
+ doc.add("countryIsoCode", country.getIsoCode()); // 'US'
+ doc.add("countryName", country.getName()); // 'United States'
+ doc.add("countryConfidence", country.getConfidence()); // 99
doc.add("countryGeoName", country.getGeoNameId());
Location location = response.getLocation();
- doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733, -93.2323
- doc.add("accRadius", location.getAccuracyRadius()); // 3
- doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
+ doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
+ // -93.2323
+ doc.add("accRadius", location.getAccuracyRadius()); // 3
+ doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
doc.add("metroCode", location.getMetroCode());
Postal postal = response.getPostal();
- doc.add("postalCode", postal.getCode()); // '55455'
+ doc.add("postalCode", postal.getCode()); // '55455'
doc.add("postalConfidence", postal.getConfidence()); // 40
RepresentedCountry rCountry = response.getRepresentedCountry();
doc.add("countryType", rCountry.getType());
Subdivision subdivision = response.getMostSpecificSubdivision();
- doc.add("subDivName", subdivision.getName()); // 'Minnesota'
- doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+ doc.add("subDivName", subdivision.getName()); // 'Minnesota'
+ doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
doc.add("subDivConfidence", subdivision.getConfidence()); // 90
doc.add("subDivGeoNameId", subdivision.getGeoNameId());
- Traits traits = response.getTraits();
+ Traits traits = response.getTraits();
doc.add("autonSystemNum", traits.getAutonomousSystemNumber());
doc.add("autonSystemOrg", traits.getAutonomousSystemOrganization());
doc.add("domain", traits.getDomain());
@@ -112,20 +118,23 @@ public class GeoIPDocumentCreator {
@SuppressWarnings("unused")
public static NutchDocument createDocFromCityService(String serverIp,
- NutchDocument doc, WebServiceClient client) throws UnknownHostException, IOException, GeoIp2Exception {
+ NutchDocument doc, WebServiceClient client) throws UnknownHostException,
+ IOException, GeoIp2Exception {
CityResponse response = client.city(InetAddress.getByName(serverIp));
return doc;
}
@SuppressWarnings("unused")
public static NutchDocument createDocFromCountryService(String serverIp,
- NutchDocument doc, WebServiceClient client) throws UnknownHostException, IOException, GeoIp2Exception {
- CountryResponse response = client.country(InetAddress.getByName(serverIp));
+ NutchDocument doc, WebServiceClient client) throws UnknownHostException,
+ IOException, GeoIp2Exception {
+ CountryResponse response = client.country(InetAddress.getByName(serverIp));
return doc;
}
- public static NutchDocument createDocFromIspDb(String serverIp, NutchDocument doc,
- DatabaseReader reader) throws UnknownHostException, IOException, GeoIp2Exception {
+ public static NutchDocument createDocFromIspDb(String serverIp,
+ NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+ IOException, GeoIp2Exception {
IspResponse response = reader.isp(InetAddress.getByName(serverIp));
doc.add("ip", serverIp);
doc.add("autonSystemNum", response.getAutonomousSystemNumber());
@@ -135,8 +144,9 @@ public class GeoIPDocumentCreator {
return doc;
}
- public static NutchDocument createDocFromDomainDb(String serverIp, NutchDocument doc,
- DatabaseReader reader) throws UnknownHostException, IOException, GeoIp2Exception {
+ public static NutchDocument createDocFromDomainDb(String serverIp,
+ NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+ IOException, GeoIp2Exception {
DomainResponse response = reader.domain(InetAddress.getByName(serverIp));
doc.add("ip", serverIp);
doc.add("domain", response.getDomain());
@@ -144,20 +154,23 @@ public class GeoIPDocumentCreator {
}
public static NutchDocument createDocFromConnectionDb(String serverIp,
- NutchDocument doc, DatabaseReader reader) throws UnknownHostException, IOException, GeoIp2Exception {
- ConnectionTypeResponse response = reader.connectionType(InetAddress.getByName(serverIp));
+ NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+ IOException, GeoIp2Exception {
+ ConnectionTypeResponse response = reader.connectionType(InetAddress
+ .getByName(serverIp));
doc.add("ip", serverIp);
doc.add("connType", response.getConnectionType().toString());
return doc;
}
- public static NutchDocument createDocFromCityDb(String serverIp, NutchDocument doc,
- DatabaseReader reader) throws UnknownHostException, IOException, GeoIp2Exception {
+ public static NutchDocument createDocFromCityDb(String serverIp,
+ NutchDocument doc, DatabaseReader reader) throws UnknownHostException,
+ IOException, GeoIp2Exception {
doc.add("ip", serverIp);
CityResponse response = reader.city(InetAddress.getByName(serverIp));
City city = response.getCity();
- doc.add("cityName", city.getName()); // 'Minneapolis'
+ doc.add("cityName", city.getName()); // 'Minneapolis'
doc.add("cityConfidence", city.getConfidence()); // 50
doc.add("cityGeoNameId", city.getGeoNameId());
@@ -167,27 +180,28 @@ public class GeoIPDocumentCreator {
doc.add("continentName", continent.getName());
Country country = response.getCountry();
- doc.add("countryIsoCode", country.getIsoCode()); // 'US'
- doc.add("countryName", country.getName()); // 'United States'
- doc.add("countryConfidence", country.getConfidence()); // 99
+ doc.add("countryIsoCode", country.getIsoCode()); // 'US'
+ doc.add("countryName", country.getName()); // 'United States'
+ doc.add("countryConfidence", country.getConfidence()); // 99
doc.add("countryGeoName", country.getGeoNameId());
Location location = response.getLocation();
- doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733, -93.2323
- doc.add("accRadius", location.getAccuracyRadius()); // 3
- doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
+ doc.add("latLon", location.getLatitude() + "," + location.getLongitude()); // 44.9733,
+ // -93.2323
+ doc.add("accRadius", location.getAccuracyRadius()); // 3
+ doc.add("timeZone", location.getTimeZone()); // 'America/Chicago'
doc.add("metroCode", location.getMetroCode());
Postal postal = response.getPostal();
- doc.add("postalCode", postal.getCode()); // '55455'
+ doc.add("postalCode", postal.getCode()); // '55455'
doc.add("postalConfidence", postal.getConfidence()); // 40
RepresentedCountry rCountry = response.getRepresentedCountry();
doc.add("countryType", rCountry.getType());
Subdivision subdivision = response.getMostSpecificSubdivision();
- doc.add("subDivName", subdivision.getName()); // 'Minnesota'
- doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
+ doc.add("subDivName", subdivision.getName()); // 'Minnesota'
+ doc.add("subDivIdoCode", subdivision.getIsoCode()); // 'MN'
doc.add("subDivConfidence", subdivision.getConfidence()); // 90
doc.add("subDivGeoNameId", subdivision.getGeoNameId());
return doc;
Modified: nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java (original)
+++ nutch/trunk/src/plugin/index-geoip/src/java/org/apache/nutch/indexer/geoip/GeoIPIndexingFilter.java Thu Jan 29 05:38:59 2015
@@ -34,16 +34,22 @@ import com.maxmind.geoip2.DatabaseReader
import com.maxmind.geoip2.WebServiceClient;
/**
- * <p>This plugin implements an indexing filter which takes
- * advantage of the
- * <a href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.</p>
- * <p>The third party library distribution provides an API for the GeoIP2
- * <a href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web services</a>
- * and <a href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>.
- * The API also works with the free
- * <a href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.</p>
- * <p>Depending on the service level agreement, you have with the GeoIP service provider,
- * the plugin can add a number of the following fields to the index data model:
+ * <p>
+ * This plugin implements an indexing filter which takes advantage of the <a
+ * href="https://github.com/maxmind/GeoIP2-java">GeoIP2-java API</a>.
+ * </p>
+ * <p>
+ * The third party library distribution provides an API for the GeoIP2 <a
+ * href="http://dev.maxmind.com/geoip/geoip2/web-services">Precision web
+ * services</a> and <a
+ * href="http://dev.maxmind.com/geoip/geoip2/downloadable">databases</a>. The
+ * API also works with the free <a
+ * href="http://dev.maxmind.com/geoip/geoip2/geolite2/">GeoLite2 databases</a>.
+ * </p>
+ * <p>
+ * Depending on the service level agreement, you have with the GeoIP service
+ * provider, the plugin can add a number of the following fields to the index
+ * data model:
* <ol>
* <li>Continent</li>
* <li>Country</li>
@@ -56,51 +62,59 @@ import com.maxmind.geoip2.WebServiceClie
* <li>Confidence Factors</li>
* <li>Radius</li>
* <li>User Type</li>
- * </ol></p>
+ * </ol>
+ * </p>
*
- * <p>Some of the services are documented at the
- * <a href="https://www.maxmind.com/en/geoip2-precision-services">GeoIP2 Precision Services</a>
- * webpage where more information can be obtained.</p>
+ * <p>
+ * Some of the services are documented at the <a
+ * href="https://www.maxmind.com/en/geoip2-precision-services">GeoIP2 Precision
+ * Services</a> webpage where more information can be obtained.
+ * </p>
+ *
+ * <p>
+ * You should also consult the following three properties in
+ * <code>nutch-site.xml</code>
+ * </p>
*
- * <p>You should also consult the following three properties in <code>nutch-site.xml</code></p>
* <pre>
- * {@code
- *<!-- index-geoip plugin properties -->
-<property>
- <name>index.geoip.usage</name>
- <value>insightsService</value>
- <description>
- A string representing the information source to be used for GeoIP information
- association. Either enter 'cityDatabase', 'connectionTypeDatabase',
- 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the
- Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb,
- GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath
- and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf
- </description>
-</property>
-
-<property>
- <name>index.geoip.userid</name>
- <value></value>
- <description>
- The userId associated with the GeoIP2 Precision Services account.
- </description>
-</property>
-
-<property>
- <name>index.geoip.licensekey</name>
- <value></value>
- <description>
- The license key associated with the GeoIP2 Precision Services account.
- </description>
-</property>
-}
+ * {@code
+ * <!-- index-geoip plugin properties -->
+ * <property>
+ * <name>index.geoip.usage</name>
+ * <value>insightsService</value>
+ * <description>
+ * A string representing the information source to be used for GeoIP information
+ * association. Either enter 'cityDatabase', 'connectionTypeDatabase',
+ * 'domainDatabase', 'ispDatabase' or 'insightsService'. If you wish to use any one of the
+ * Database options, you should make one of GeoIP2-City.mmdb, GeoIP2-Connection-Type.mmdb,
+ * GeoIP2-Domain.mmdb or GeoIP2-ISP.mmdb files respectively available on the Hadoop classpath
+ * and available at runtime. This can be achieved by adding it to $NUTCH_HOME/conf
+ * </description>
+ * </property>
+ *
+ * <property>
+ * <name>index.geoip.userid</name>
+ * <value></value>
+ * <description>
+ * The userId associated with the GeoIP2 Precision Services account.
+ * </description>
+ * </property>
+ *
+ * <property>
+ * <name>index.geoip.licensekey</name>
+ * <value></value>
+ * <description>
+ * The license key associated with the GeoIP2 Precision Services account.
+ * </description>
+ * </property>
+ * }
* </pre>
*
*/
public class GeoIPIndexingFilter implements IndexingFilter {
- private static final Logger LOG = LoggerFactory.getLogger(GeoIPIndexingFilter.class);
+ private static final Logger LOG = LoggerFactory
+ .getLogger(GeoIPIndexingFilter.class);
private Configuration conf;
@@ -112,7 +126,7 @@ public class GeoIPIndexingFilter impleme
DatabaseReader reader = null;
- //private AbstractResponse response = null;
+ // private AbstractResponse response = null;
/**
* Default constructor for this plugin
@@ -145,7 +159,8 @@ public class GeoIPIndexingFilter impleme
}
} else if (use.equalsIgnoreCase("connectionTypeDatabase")) {
try {
- geoDb = new File(conf.getResource("GeoIP2-Connection-Type.mmdb").getFile());
+ geoDb = new File(conf.getResource("GeoIP2-Connection-Type.mmdb")
+ .getFile());
buildDb();
} catch (Exception e) {
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
@@ -165,8 +180,8 @@ public class GeoIPIndexingFilter impleme
LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
}
} else if (use.equalsIgnoreCase("insightsService")) {
- client = new WebServiceClient.Builder(
- conf.getInt("index.geoip.userid", 12345), conf.get("index.geoip.licensekey")).build();
+ client = new WebServiceClient.Builder(conf.getInt("index.geoip.userid",
+ 12345), conf.get("index.geoip.licensekey")).build();
}
usage = use;
}
@@ -181,7 +196,9 @@ public class GeoIPIndexingFilter impleme
/**
*
- * @see org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument, org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text, org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks)
+ * @see org.apache.nutch.indexer.IndexingFilter#filter(org.apache.nutch.indexer.NutchDocument,
+ * org.apache.nutch.parse.Parse, org.apache.hadoop.io.Text,
+ * org.apache.nutch.crawl.CrawlDatum, org.apache.nutch.crawl.Inlinks)
*/
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
@@ -189,22 +206,28 @@ public class GeoIPIndexingFilter impleme
return addServerGeo(doc, parse.getData(), url.toString());
}
- private NutchDocument addServerGeo(NutchDocument doc, ParseData data, String url) {
+ private NutchDocument addServerGeo(NutchDocument doc, ParseData data,
+ String url) {
if (conf.getBoolean("store.ip.address", false) == true) {
try {
String serverIp = data.getContentMeta().get("_ip_");
if (serverIp != null) {
if (usage.equalsIgnoreCase("cityDatabase")) {
- doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc, reader);
+ doc = GeoIPDocumentCreator.createDocFromCityDb(serverIp, doc,
+ reader);
} else if (usage.equalsIgnoreCase("connectionTypeDatabase")) {
- doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc, reader);
+ doc = GeoIPDocumentCreator.createDocFromConnectionDb(serverIp, doc,
+ reader);
} else if (usage.equalsIgnoreCase("domainDatabase")) {
- doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc, reader);
+ doc = GeoIPDocumentCreator.createDocFromDomainDb(serverIp, doc,
+ reader);
} else if (usage.equalsIgnoreCase("ispDatabase")) {
- doc = GeoIPDocumentCreator.createDocFromIspDb(serverIp, doc, reader);
+ doc = GeoIPDocumentCreator
+ .createDocFromIspDb(serverIp, doc, reader);
} else if (usage.equalsIgnoreCase("insightsService")) {
- doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp, doc, client);
+ doc = GeoIPDocumentCreator.createDocFromInsightsService(serverIp,
+ doc, client);
}
}
} catch (Exception e) {
Modified: nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java (original)
+++ nutch/trunk/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
* Metadata may come from CrawlDb, parse or content metadata.
*/
package org.apache.nutch.indexer.metadata;
+