You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/29 06:39:03 UTC
svn commit: r1655526 [6/26] - in /nutch/trunk: ./
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/
src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/pr...
Modified: nutch/trunk/src/java/org/apache/nutch/metadata/DublinCore.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/DublinCore.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/DublinCore.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/DublinCore.java Thu Jan 29 05:38:59 2015
@@ -16,149 +16,146 @@
*/
package org.apache.nutch.metadata;
-
/**
* A collection of Dublin Core metadata names.
- *
- * @see <a href="http://dublincore.org">dublincore.org</a>
- *
+ *
+ * @see <a href="http://dublincore.org">dublincore.org</a>
+ *
* @author Chris Mattmann
* @author Jérôme Charron
*/
public interface DublinCore {
-
-
+
/**
- * Typically, Format may include the media-type or dimensions of the
- * resource. Format may be used to determine the software, hardware or other
- * equipment needed to display or operate the resource. Examples of
- * dimensions include size and duration. Recommended best practice is to
- * select a value from a controlled vocabulary (for example, the list of
- * Internet Media Types [MIME] defining computer media formats).
+ * Typically, Format may include the media-type or dimensions of the resource.
+ * Format may be used to determine the software, hardware or other equipment
+ * needed to display or operate the resource. Examples of dimensions include
+ * size and duration. Recommended best practice is to select a value from a
+ * controlled vocabulary (for example, the list of Internet Media Types [MIME]
+ * defining computer media formats).
*/
public static final String FORMAT = "format";
-
+
/**
- * Recommended best practice is to identify the resource by means of a
- * string or number conforming to a formal identification system. Example
- * formal identification systems include the Uniform Resource Identifier
- * (URI) (including the Uniform Resource Locator (URL)), the Digital Object
+ * Recommended best practice is to identify the resource by means of a string
+ * or number conforming to a formal identification system. Example formal
+ * identification systems include the Uniform Resource Identifier (URI)
+ * (including the Uniform Resource Locator (URL)), the Digital Object
* Identifier (DOI) and the International Standard Book Number (ISBN).
*/
public static final String IDENTIFIER = "identifier";
-
+
/**
* Date on which the resource was changed.
*/
public static final String MODIFIED = "modified";
-
+
/**
* An entity responsible for making contributions to the content of the
- * resource. Examples of a Contributor include a person, an organisation, or
- * a service. Typically, the name of a Contributor should be used to
- * indicate the entity.
+ * resource. Examples of a Contributor include a person, an organisation, or a
+ * service. Typically, the name of a Contributor should be used to indicate
+ * the entity.
*/
public static final String CONTRIBUTOR = "contributor";
-
+
/**
- * The extent or scope of the content of the resource. Coverage will
- * typically include spatial location (a place name or geographic
- * coordinates), temporal period (a period label, date, or date range) or
- * jurisdiction (such as a named administrative entity). Recommended best
- * practice is to select a value from a controlled vocabulary (for example,
- * the Thesaurus of Geographic Names [TGN]) and that, where appropriate,
- * named places or time periods be used in preference to numeric identifiers
- * such as sets of coordinates or date ranges.
+ * The extent or scope of the content of the resource. Coverage will typically
+ * include spatial location (a place name or geographic coordinates), temporal
+ * period (a period label, date, or date range) or jurisdiction (such as a
+ * named administrative entity). Recommended best practice is to select a
+ * value from a controlled vocabulary (for example, the Thesaurus of
+ * Geographic Names [TGN]) and that, where appropriate, named places or time
+ * periods be used in preference to numeric identifiers such as sets of
+ * coordinates or date ranges.
*/
public static final String COVERAGE = "coverage";
-
+
/**
* An entity primarily responsible for making the content of the resource.
* Examples of a Creator include a person, an organisation, or a service.
* Typically, the name of a Creator should be used to indicate the entity.
*/
public static final String CREATOR = "creator";
-
+
/**
* A date associated with an event in the life cycle of the resource.
- * Typically, Date will be associated with the creation or availability of
- * the resource. Recommended best practice for encoding the date value is
- * defined in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD
- * format.
+ * Typically, Date will be associated with the creation or availability of the
+ * resource. Recommended best practice for encoding the date value is defined
+ * in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD format.
*/
public static final String DATE = "date";
-
+
/**
* An account of the content of the resource. Description may include but is
* not limited to: an abstract, table of contents, reference to a graphical
* representation of content or a free-text account of the content.
*/
public static final String DESCRIPTION = "description";
-
+
/**
* A language of the intellectual content of the resource. Recommended best
* practice is to use RFC 3066 [RFC3066], which, in conjunction with ISO 639
- * [ISO639], defines two- and three-letter primary language tags with
- * optional subtags. Examples include "en" or "eng" for English, "akk" for
- * Akkadian, and "en-GB" for English used in the United Kingdom.
+ * [ISO639], defines two- and three-letter primary language tags with optional
+ * subtags. Examples include "en" or "eng" for English, "akk" for Akkadian,
+ * and "en-GB" for English used in the United Kingdom.
*/
public static final String LANGUAGE = "language";
-
+
/**
* An entity responsible for making the resource available. Examples of a
* Publisher include a person, an organisation, or a service. Typically, the
* name of a Publisher should be used to indicate the entity.
*/
public static final String PUBLISHER = "publisher";
-
+
/**
* A reference to a related resource. Recommended best practice is to
* reference the resource by means of a string or number conforming to a
* formal identification system.
*/
public static final String RELATION = "relation";
-
+
/**
- * Information about rights held in and over the resource. Typically, a
- * Rights element will contain a rights management statement for the
- * resource, or reference a service providing such information. Rights
- * information often encompasses Intellectual Property Rights (IPR),
- * Copyright, and various Property Rights. If the Rights element is absent,
- * no assumptions can be made about the status of these and other rights
- * with respect to the resource.
+ * Information about rights held in and over the resource. Typically, a Rights
+ * element will contain a rights management statement for the resource, or
+ * reference a service providing such information. Rights information often
+ * encompasses Intellectual Property Rights (IPR), Copyright, and various
+ * Property Rights. If the Rights element is absent, no assumptions can be
+ * made about the status of these and other rights with respect to the
+ * resource.
*/
public static final String RIGHTS = "rights";
-
+
/**
* A reference to a resource from which the present resource is derived. The
* present resource may be derived from the Source resource in whole or in
- * part. Recommended best practice is to reference the resource by means of
- * a string or number conforming to a formal identification system.
+ * part. Recommended best practice is to reference the resource by means of a
+ * string or number conforming to a formal identification system.
*/
public static final String SOURCE = "source";
-
+
/**
* The topic of the content of the resource. Typically, a Subject will be
- * expressed as keywords, key phrases or classification codes that describe
- * a topic of the resource. Recommended best practice is to select a value
- * from a controlled vocabulary or formal classification scheme.
+ * expressed as keywords, key phrases or classification codes that describe a
+ * topic of the resource. Recommended best practice is to select a value from
+ * a controlled vocabulary or formal classification scheme.
*/
public static final String SUBJECT = "subject";
-
+
/**
* A name given to the resource. Typically, a Title will be a name by which
* the resource is formally known.
*/
public static final String TITLE = "title";
-
+
/**
* The nature or genre of the content of the resource. Type includes terms
- * describing general categories, functions, genres, or aggregation levels
- * for content. Recommended best practice is to select a value from a
- * controlled vocabulary (for example, the DCMI Type Vocabulary [DCMITYPE]).
- * To describe the physical or digital manifestation of the resource, use
- * the Format element.
+ * describing general categories, functions, genres, or aggregation levels for
+ * content. Recommended best practice is to select a value from a controlled
+ * vocabulary (for example, the DCMI Type Vocabulary [DCMITYPE]). To describe
+ * the physical or digital manifestation of the resource, use the Format
+ * element.
*/
public static final String TYPE = "type";
-
+
}
Modified: nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java Thu Jan 29 05:38:59 2015
@@ -20,32 +20,32 @@ import org.apache.hadoop.io.Text;
/**
* A collection of HTTP header names.
- *
- * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/">Hypertext Transfer
- * Protocol -- HTTP/1.1 (RFC 2616)</a>
+ *
+ * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/">Hypertext Transfer Protocol
+ * -- HTTP/1.1 (RFC 2616)</a>
*/
public interface HttpHeaders {
public final static String TRANSFER_ENCODING = "Transfer-Encoding";
-
+
public final static String CONTENT_ENCODING = "Content-Encoding";
-
+
public final static String CONTENT_LANGUAGE = "Content-Language";
public final static String CONTENT_LENGTH = "Content-Length";
-
+
public final static String CONTENT_LOCATION = "Content-Location";
-
+
public static final String CONTENT_DISPOSITION = "Content-Disposition";
public final static String CONTENT_MD5 = "Content-MD5";
-
+
public final static String CONTENT_TYPE = "Content-Type";
public static final Text WRITABLE_CONTENT_TYPE = new Text(CONTENT_TYPE);
-
+
public final static String LAST_MODIFIED = "Last-Modified";
-
+
public final static String LOCATION = "Location";
}
Modified: nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/MetaWrapper.java Thu Jan 29 05:38:59 2015
@@ -28,28 +28,29 @@ import org.apache.nutch.crawl.NutchWrita
/**
* This is a simple decorator that adds metadata to any Writable-s that can be
* serialized by <tt>NutchWritable</tt>. This is useful when data needs to be
- * temporarily enriched during processing, but this
- * temporary metadata doesn't need to be permanently stored after the job is done.
+ * temporarily enriched during processing, but this temporary metadata doesn't
+ * need to be permanently stored after the job is done.
*
* @author Andrzej Bialecki
*/
public class MetaWrapper extends NutchWritable {
private Metadata metadata;
-
+
public MetaWrapper() {
super();
metadata = new Metadata();
}
-
+
public MetaWrapper(Writable instance, Configuration conf) {
super(instance);
metadata = new Metadata();
setConf(conf);
}
-
+
public MetaWrapper(Metadata metadata, Writable instance, Configuration conf) {
super(instance);
- if (metadata == null) metadata = new Metadata();
+ if (metadata == null)
+ metadata = new Metadata();
this.metadata = metadata;
setConf(conf);
}
@@ -60,43 +61,52 @@ public class MetaWrapper extends NutchWr
public Metadata getMetadata() {
return metadata;
}
-
+
/**
- * Add metadata. See {@link Metadata#add(String, String)} for more information.
- * @param name metadata name
- * @param value metadata value
+ * Add metadata. See {@link Metadata#add(String, String)} for more
+ * information.
+ *
+ * @param name
+ * metadata name
+ * @param value
+ * metadata value
*/
public void addMeta(String name, String value) {
metadata.add(name, value);
}
-
+
/**
- * Set metadata. See {@link Metadata#set(String, String)} for more information.
+ * Set metadata. See {@link Metadata#set(String, String)} for more
+ * information.
+ *
* @param name
* @param value
*/
public void setMeta(String name, String value) {
metadata.set(name, value);
}
-
+
/**
* Get metadata. See {@link Metadata#get(String)} for more information.
+ *
* @param name
* @return metadata value
*/
public String getMeta(String name) {
return metadata.get(name);
}
-
+
/**
- * Get multiple metadata. See {@link Metadata#getValues(String)} for more information.
+ * Get multiple metadata. See {@link Metadata#getValues(String)} for more
+ * information.
+ *
* @param name
* @return multiple values
*/
public String[] getMetaValues(String name) {
return metadata.getValues(name);
}
-
+
public void readFields(DataInput in) throws IOException {
super.readFields(in);
metadata = new Metadata();
Modified: nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Thu Jan 29 05:38:59 2015
@@ -30,15 +30,14 @@ import org.apache.hadoop.io.Writable;
/**
* A multi-valued metadata container.
*/
-public class Metadata implements Writable, CreativeCommons,
-DublinCore, HttpHeaders, Nutch, Feed {
+public class Metadata implements Writable, CreativeCommons, DublinCore,
+ HttpHeaders, Nutch, Feed {
/**
* A map of all metadata attributes.
*/
private Map<String, String[]> metadata = null;
-
/**
* Constructs a new, empty metadata.
*/
@@ -48,9 +47,10 @@ DublinCore, HttpHeaders, Nutch, Feed {
/**
* Returns true if named value is multivalued.
- * @param name name of metadata
- * @return true is named value is multivalued, false if single
- * value or null
+ *
+ * @param name
+ * name of metadata
+ * @return true if named value is multivalued, false if single value or null
*/
public boolean isMultiValued(final String name) {
return metadata.get(name) != null && metadata.get(name).length > 1;
@@ -58,6 +58,7 @@ DublinCore, HttpHeaders, Nutch, Feed {
/**
* Returns an array of the names contained in the metadata.
+ *
* @return Metadata names
*/
public String[] names() {
@@ -65,11 +66,11 @@ DublinCore, HttpHeaders, Nutch, Feed {
}
/**
- * Get the value associated to a metadata name.
- * If many values are assiociated to the specified name, then the first
- * one is returned.
- *
- * @param name of the metadata.
+ * Get the value associated to a metadata name. If many values are associated
+ * to the specified name, then the first one is returned.
+ *
+ * @param name
+ * of the metadata.
* @return the value associated to the specified metadata name.
*/
public String get(final String name) {
@@ -83,13 +84,15 @@ DublinCore, HttpHeaders, Nutch, Feed {
/**
* Get the values associated to a metadata name.
- * @param name of the metadata.
+ *
+ * @param name
+ * of the metadata.
* @return the values associated to a metadata name.
*/
public String[] getValues(final String name) {
return _getValues(name);
}
-
+
private String[] _getValues(final String name) {
String[] values = metadata.get(name);
if (values == null) {
@@ -99,12 +102,13 @@ DublinCore, HttpHeaders, Nutch, Feed {
}
/**
- * Add a metadata name/value mapping.
- * Add the specified value to the list of values associated to the
- * specified metadata name.
- *
- * @param name the metadata name.
- * @param value the metadata value.
+ * Add a metadata name/value mapping. Add the specified value to the list of
+ * values associated to the specified metadata name.
+ *
+ * @param name
+ * the metadata name.
+ * @param value
+ * the metadata value.
*/
public void add(final String name, final String value) {
String[] values = metadata.get(name);
@@ -120,31 +124,37 @@ DublinCore, HttpHeaders, Nutch, Feed {
/**
* Copy All key-value pairs from properties.
- * @param properties properties to copy from
+ *
+ * @param properties
+ * properties to copy from
*/
public void setAll(Properties properties) {
Enumeration<?> names = properties.propertyNames();
while (names.hasMoreElements()) {
String name = (String) names.nextElement();
- metadata.put(name, new String[]{properties.getProperty(name)});
+ metadata.put(name, new String[] { properties.getProperty(name) });
}
}
/**
- * Set metadata name/value.
- * Associate the specified value to the specified metadata name. If some
- * previous values were associated to this name, they are removed.
- *
- * @param name the metadata name.
- * @param value the metadata value.
+ * Set metadata name/value. Associate the specified value to the specified
+ * metadata name. If some previous values were associated to this name, they
+ * are removed.
+ *
+ * @param name
+ * the metadata name.
+ * @param value
+ * the metadata value.
*/
public void set(String name, String value) {
- metadata.put(name, new String[]{value});
+ metadata.put(name, new String[] { value });
}
/**
* Remove a metadata and all its associated values.
- * @param name metadata name to remove
+ *
+ * @param name
+ * metadata name to remove
*/
public void remove(String name) {
metadata.remove(name);
@@ -152,12 +162,13 @@ DublinCore, HttpHeaders, Nutch, Feed {
/**
* Returns the number of metadata names in this metadata.
+ *
* @return number of metadata names
*/
public int size() {
return metadata.size();
}
-
+
/** Remove all mappings from metadata. */
public void clear() {
metadata.clear();
@@ -165,7 +176,9 @@ DublinCore, HttpHeaders, Nutch, Feed {
public boolean equals(Object o) {
- if (o == null) { return false; }
+ if (o == null) {
+ return false;
+ }
Metadata other = null;
try {
@@ -174,7 +187,9 @@ DublinCore, HttpHeaders, Nutch, Feed {
return false;
}
- if (other.size() != size()) { return false; }
+ if (other.size() != size()) {
+ return false;
+ }
String[] names = names();
for (int i = 0; i < names.length; i++) {
@@ -198,10 +213,7 @@ DublinCore, HttpHeaders, Nutch, Feed {
for (int i = 0; i < names.length; i++) {
String[] values = _getValues(names[i]);
for (int j = 0; j < values.length; j++) {
- buf.append(names[i])
- .append("=")
- .append(values[j])
- .append(" ");
+ buf.append(names[i]).append("=").append(values[j]).append(" ");
}
}
return buf.toString();
Modified: nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/Nutch.java Thu Jan 29 05:38:59 2015
@@ -18,20 +18,17 @@ package org.apache.nutch.metadata;
import org.apache.hadoop.io.Text;
-
/**
* A collection of Nutch internal metadata constants.
- *
+ *
* @author Chris Mattmann
* @author Jérôme Charron
*/
public interface Nutch {
-
- public static final String ORIGINAL_CHAR_ENCODING =
- "OriginalCharEncoding";
-
- public static final String CHAR_ENCODING_FOR_CONVERSION =
- "CharEncodingForConversion";
+
+ public static final String ORIGINAL_CHAR_ENCODING = "OriginalCharEncoding";
+
+ public static final String CHAR_ENCODING_FOR_CONVERSION = "CharEncodingForConversion";
public static final String SIGNATURE_KEY = "nutch.content.digest";
@@ -41,17 +38,22 @@ public interface Nutch {
public static final String GENERATE_TIME_KEY = "_ngt_";
- public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(GENERATE_TIME_KEY);
+ public static final Text WRITABLE_GENERATE_TIME_KEY = new Text(
+ GENERATE_TIME_KEY);
public static final String PROTO_STATUS_KEY = "_pst_";
- public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(PROTO_STATUS_KEY);
-
+ public static final Text WRITABLE_PROTO_STATUS_KEY = new Text(
+ PROTO_STATUS_KEY);
+
public static final String FETCH_TIME_KEY = "_ftk_";
-
+
public static final String FETCH_STATUS_KEY = "_fst_";
- /** Sites may request that search engines don't provide access to cached documents. */
+ /**
+ * Sites may request that search engines don't provide access to cached
+ * documents.
+ */
public static final String CACHING_FORBIDDEN_KEY = "caching.forbidden";
/** Show both original forbidden content and summaries (default). */
@@ -70,5 +72,6 @@ public interface Nutch {
/** Used by AdaptiveFetchSchedule to maintain custom fetch interval */
public static final String FIXED_INTERVAL_KEY = "fixedInterval";
- public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(FIXED_INTERVAL_KEY);
+ public static final Text WRITABLE_FIXED_INTERVAL_KEY = new Text(
+ FIXED_INTERVAL_KEY);
}
Modified: nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java Thu Jan 29 05:38:59 2015
@@ -33,7 +33,7 @@ public class SpellCheckedMetadata extend
/**
* Treshold divider.
- *
+ *
* <code>threshold = searched.length() / TRESHOLD_DIVIDER;</code>
*/
private static final int TRESHOLD_DIVIDER = 3;
@@ -52,7 +52,7 @@ public class SpellCheckedMetadata extend
// Uses following array to fill the metanames index and the
// metanames list.
- Class<?>[] spellthese = {HttpHeaders.class};
+ Class<?>[] spellthese = { HttpHeaders.class };
for (Class<?> spellCheckedNames : spellthese) {
for (Field field : spellCheckedNames.getFields()) {
@@ -73,7 +73,7 @@ public class SpellCheckedMetadata extend
/**
* Normalizes String.
- *
+ *
* @param str
* the string to normalize
* @return normalized String
@@ -102,7 +102,7 @@ public class SpellCheckedMetadata extend
* </ul>
* If no matching with a well-known metadata name is found, then the original
* name is returned.
- *
+ *
* @param name
* Name to normalize
* @return normalized name
Modified: nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLFilter.java Thu Jan 29 05:38:59 2015
@@ -23,17 +23,18 @@ import org.apache.hadoop.conf.Configurab
// Nutch imports
import org.apache.nutch.plugin.Pluggable;
-
/**
- * Interface used to limit which URLs enter Nutch.
- * Used by the injector and the db updater.
+ * Interface used to limit which URLs enter Nutch. Used by the injector and the
+ * db updater.
*/
public interface URLFilter extends Pluggable, Configurable {
/** The name of the extension point. */
public final static String X_POINT_ID = URLFilter.class.getName();
- /* Interface for a filter that transforms a URL: it can pass the
- original URL through or "delete" the URL by returning null */
+ /*
+ * Interface for a filter that transforms a URL: it can pass the original URL
+ * through or "delete" the URL by returning null
+ */
public String filter(String urlString);
}
Modified: nutch/trunk/src/java/org/apache/nutch/net/URLFilterChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLFilterChecker.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLFilterChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLFilterChecker.java Thu Jan 29 05:38:59 2015
@@ -38,23 +38,23 @@ public class URLFilterChecker {
private Configuration conf;
public URLFilterChecker(Configuration conf) {
- this.conf = conf;
+ this.conf = conf;
}
private void checkOne(String filterName) throws Exception {
URLFilter filter = null;
- ExtensionPoint point =
- PluginRepository.get(conf).getExtensionPoint(URLFilter.X_POINT_ID);
+ ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+ URLFilter.X_POINT_ID);
if (point == null)
- throw new RuntimeException(URLFilter.X_POINT_ID+" not found.");
+ throw new RuntimeException(URLFilter.X_POINT_ID + " not found.");
Extension[] extensions = point.getExtensions();
for (int i = 0; i < extensions.length; i++) {
Extension extension = extensions[i];
- filter = (URLFilter)extension.getExtensionInstance();
+ filter = (URLFilter) extension.getExtensionInstance();
if (filter.getClass().getName().equals(filterName)) {
break;
} else {
@@ -63,19 +63,19 @@ public class URLFilterChecker {
}
if (filter == null)
- throw new RuntimeException("Filter "+filterName+" not found.");
+ throw new RuntimeException("Filter " + filterName + " not found.");
// jerome : should we keep this behavior?
- //if (LogFormatter.hasLoggedSevere())
- // throw new RuntimeException("Severe error encountered.");
+ // if (LogFormatter.hasLoggedSevere())
+ // throw new RuntimeException("Severe error encountered.");
- System.out.println("Checking URLFilter "+filterName);
+ System.out.println("Checking URLFilter " + filterName);
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
String line;
- while((line=in.readLine())!=null) {
- String out=filter.filter(line);
- if(out!=null) {
+ while ((line = in.readLine()) != null) {
+ String out = filter.filter(line);
+ if (out != null) {
System.out.print("+");
System.out.println(out);
} else {
@@ -90,10 +90,10 @@ public class URLFilterChecker {
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
String line;
- while((line=in.readLine())!=null) {
+ while ((line = in.readLine()) != null) {
URLFilters filters = new URLFilters(this.conf);
String out = filters.filter(line);
- if(out!=null) {
+ if (out != null) {
System.out.print("+");
System.out.println(out);
} else {
@@ -105,8 +105,8 @@ public class URLFilterChecker {
public static void main(String[] args) throws Exception {
- String usage = "Usage: URLFilterChecker (-filterName filterName | -allCombined) \n"
- + "Tool takes a list of URLs, one per line, passed via STDIN.\n";
+ String usage = "Usage: URLFilterChecker (-filterName filterName | -allCombined) \n"
+ + "Tool takes a list of URLs, one per line, passed via STDIN.\n";
if (args.length == 0) {
System.err.println(usage);
Modified: nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLFilters.java Thu Jan 29 05:38:59 2015
@@ -20,16 +20,15 @@ package org.apache.nutch.net;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.plugin.PluginRepository;
-/** Creates and caches {@link URLFilter} implementing plugins.*/
+/** Creates and caches {@link URLFilter} implementing plugins. */
public class URLFilters {
public static final String URLFILTER_ORDER = "urlfilter.order";
private URLFilter[] filters;
public URLFilters(Configuration conf) {
- this.filters = (URLFilter[]) PluginRepository.get(conf)
- .getOrderedPlugins(URLFilter.class, URLFilter.X_POINT_ID,
- URLFILTER_ORDER);
+ this.filters = (URLFilter[]) PluginRepository.get(conf).getOrderedPlugins(
+ URLFilter.class, URLFilter.X_POINT_ID, URLFILTER_ORDER);
}
/** Run all defined filters. Assume logical AND. */
Modified: nutch/trunk/src/java/org/apache/nutch/net/URLNormalizer.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizer.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLNormalizer.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLNormalizer.java Thu Jan 29 05:38:59 2015
@@ -21,13 +21,17 @@ import java.net.MalformedURLException;
import org.apache.hadoop.conf.Configurable;
-/** Interface used to convert URLs to normal form and optionally perform substitutions */
+/**
+ * Interface used to convert URLs to normal form and optionally perform
+ * substitutions
+ */
public interface URLNormalizer extends Configurable {
-
+
/* Extension ID */
public static final String X_POINT_ID = URLNormalizer.class.getName();
-
+
/* Interface for URL normalization */
- public String normalize(String urlString, String scope) throws MalformedURLException;
+ public String normalize(String urlString, String scope)
+ throws MalformedURLException;
}
Modified: nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLNormalizerChecker.java Thu Jan 29 05:38:59 2015
@@ -36,23 +36,23 @@ public class URLNormalizerChecker {
private Configuration conf;
public URLNormalizerChecker(Configuration conf) {
- this.conf = conf;
+ this.conf = conf;
}
private void checkOne(String normalizerName, String scope) throws Exception {
URLNormalizer normalizer = null;
- ExtensionPoint point =
- PluginRepository.get(conf).getExtensionPoint(URLNormalizer.X_POINT_ID);
+ ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+ URLNormalizer.X_POINT_ID);
if (point == null)
- throw new RuntimeException(URLNormalizer.X_POINT_ID+" not found.");
+ throw new RuntimeException(URLNormalizer.X_POINT_ID + " not found.");
Extension[] extensions = point.getExtensions();
for (int i = 0; i < extensions.length; i++) {
Extension extension = extensions[i];
- normalizer = (URLNormalizer)extension.getExtensionInstance();
+ normalizer = (URLNormalizer) extension.getExtensionInstance();
if (normalizer.getClass().getName().equals(normalizerName)) {
break;
} else {
@@ -61,7 +61,8 @@ public class URLNormalizerChecker {
}
if (normalizer == null)
- throw new RuntimeException("URLNormalizer "+normalizerName+" not found.");
+ throw new RuntimeException("URLNormalizer " + normalizerName
+ + " not found.");
System.out.println("Checking URLNormalizer " + normalizerName);
@@ -79,7 +80,7 @@ public class URLNormalizerChecker {
BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
String line;
URLNormalizers normalizers = new URLNormalizers(conf, scope);
- while((line = in.readLine()) != null) {
+ while ((line = in.readLine()) != null) {
String out = normalizers.normalize(line, scope);
System.out.println(out);
}
@@ -88,7 +89,7 @@ public class URLNormalizerChecker {
public static void main(String[] args) throws Exception {
String usage = "Usage: URLNormalizerChecker [-normalizer <normalizerName>] [-scope <scope>]"
- + "\n\tscope can be one of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink";
+ + "\n\tscope can be one of: default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink";
String normalizerName = null;
String scope = URLNormalizers.SCOPE_DEFAULT;
@@ -103,7 +104,8 @@ public class URLNormalizerChecker {
}
}
- URLNormalizerChecker checker = new URLNormalizerChecker(NutchConfiguration.create());
+ URLNormalizerChecker checker = new URLNormalizerChecker(
+ NutchConfiguration.create());
if (normalizerName != null) {
checker.checkOne(normalizerName, scope);
} else {
Modified: nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java Thu Jan 29 05:38:59 2015
@@ -43,47 +43,63 @@ import org.apache.nutch.util.ObjectCache
* contexts where they are used (note however that they need to be activated
* first through <tt>plugin.include</tt> property).
*
- * <p>There is one global scope defined by default, which consists of all
- * active normalizers. The order in which these normalizers
- * are executed may be defined in "urlnormalizer.order" property, which lists
- * space-separated implementation classes (if this property is missing normalizers
- * will be run in random order). If there are more
- * normalizers activated than explicitly named on this list, the remaining ones
- * will be run in random order after the ones specified on the list are executed.</p>
- * <p>You can define a set of contexts (or scopes) in which normalizers may be
+ * <p>
+ * There is one global scope defined by default, which consists of all active
+ * normalizers. The order in which these normalizers are executed may be defined
+ * in "urlnormalizer.order" property, which lists space-separated implementation
+ * classes (if this property is missing normalizers will be run in random
+ * order). If there are more normalizers activated than explicitly named on this
+ * list, the remaining ones will be run in random order after the ones specified
+ * on the list are executed.
+ * </p>
+ * <p>
+ * You can define a set of contexts (or scopes) in which normalizers may be
* called. Each scope can have its own list of normalizers (defined in
* "urlnormalizer.scope.<scope_name>" property) and its own order (defined in
* "urlnormalizer.order.<scope_name>" property). If any of these properties are
- * missing, default settings are used for the global scope.</p>
- * <p>In case no normalizers are required for any given scope, a
- * <code>org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer</code> should be used.</p>
- * <p>Each normalizer may further select among many configurations, depending on
- * the scope in which it is called, because the scope name is passed as a parameter
- * to each normalizer. You can also use the same normalizer for many scopes.</p>
- * <p>Several scopes have been defined, and various Nutch tools will attempt using
- * scope-specific normalizers first (and fall back to default config if scope-specific
- * configuration is missing).</p>
- * <p>Normalizers may be run several times, to ensure that modifications introduced
+ * missing, default settings are used for the global scope.
+ * </p>
+ * <p>
+ * In case no normalizers are required for any given scope, a
+ * <code>org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer</code> should
+ * be used.
+ * </p>
+ * <p>
+ * Each normalizer may further select among many configurations, depending on
+ * the scope in which it is called, because the scope name is passed as a
+ * parameter to each normalizer. You can also use the same normalizer for many
+ * scopes.
+ * </p>
+ * <p>
+ * Several scopes have been defined, and various Nutch tools will attempt using
+ * scope-specific normalizers first (and fall back to default config if
+ * scope-specific configuration is missing).
+ * </p>
+ * <p>
+ * Normalizers may be run several times, to ensure that modifications introduced
* by normalizers at the end of the list can be further reduced by normalizers
- * executed at the beginning. By default this loop is executed just once - if you want
- * to ensure that all possible combinations have been applied you may want to run
- * this loop up to the number of activated normalizers. This loop count can be configured
- * through <tt>urlnormalizer.loop.count</tt> property. As soon as the url is
- * unchanged the loop will stop and return the result.</p>
+ * executed at the beginning. By default this loop is executed just once - if
+ * you want to ensure that all possible combinations have been applied you may
+ * want to run this loop up to the number of activated normalizers. This loop
+ * count can be configured through <tt>urlnormalizer.loop.count</tt> property.
+ * As soon as the url is unchanged the loop will stop and return the result.
+ * </p>
*
* @author Andrzej Bialecki
*/
public final class URLNormalizers {
-
- /** Default scope. If no scope properties are defined then the configuration for
- * this scope will be used.
+
+ /**
+ * Default scope. If no scope properties are defined then the configuration
+ * for this scope will be used.
*/
public static final String SCOPE_DEFAULT = "default";
/** Scope used by {@link org.apache.nutch.crawl.URLPartitioner}. */
public static final String SCOPE_PARTITION = "partition";
/** Scope used by {@link org.apache.nutch.crawl.Generator}. */
public static final String SCOPE_GENERATE_HOST_COUNT = "generate_host_count";
- /** Scope used by {@link org.apache.nutch.fetcher.Fetcher} when processing
+ /**
+ * Scope used by {@link org.apache.nutch.fetcher.Fetcher} when processing
* redirect URLs.
*/
public static final String SCOPE_FETCHER = "fetcher";
@@ -93,16 +109,21 @@ public final class URLNormalizers {
public static final String SCOPE_LINKDB = "linkdb";
/** Scope used by {@link org.apache.nutch.crawl.Injector}. */
public static final String SCOPE_INJECT = "inject";
- /** Scope used when constructing new {@link org.apache.nutch.parse.Outlink} instances. */
+ /**
+ * Scope used when constructing new {@link org.apache.nutch.parse.Outlink}
+ * instances.
+ */
public static final String SCOPE_OUTLINK = "outlink";
/** Scope used when indexing URLs. */
public static final String SCOPE_INDEXER = "indexer";
- public static final Logger LOG = LoggerFactory.getLogger(URLNormalizers.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(URLNormalizers.class);
/* Empty extension list for caching purposes. */
- private final List<Extension> EMPTY_EXTENSION_LIST = Collections.<Extension>emptyList();
-
+ private final List<Extension> EMPTY_EXTENSION_LIST = Collections
+ .<Extension> emptyList();
+
private final URLNormalizer[] EMPTY_NORMALIZERS = new URLNormalizer[0];
private Configuration conf;
@@ -110,37 +131,39 @@ public final class URLNormalizers {
private ExtensionPoint extensionPoint;
private URLNormalizer[] normalizers;
-
+
private int loopCount;
public URLNormalizers(Configuration conf, String scope) {
this.conf = conf;
this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
- URLNormalizer.X_POINT_ID);
+ URLNormalizer.X_POINT_ID);
ObjectCache objectCache = ObjectCache.get(conf);
-
+
if (this.extensionPoint == null) {
throw new RuntimeException("x point " + URLNormalizer.X_POINT_ID
- + " not found.");
+ + " not found.");
}
- normalizers = (URLNormalizer[])objectCache.getObject(URLNormalizer.X_POINT_ID + "_" + scope);
+ normalizers = (URLNormalizer[]) objectCache
+ .getObject(URLNormalizer.X_POINT_ID + "_" + scope);
if (normalizers == null) {
normalizers = getURLNormalizers(scope);
}
if (normalizers == EMPTY_NORMALIZERS) {
- normalizers = (URLNormalizer[])objectCache.getObject(URLNormalizer.X_POINT_ID + "_" + SCOPE_DEFAULT);
+ normalizers = (URLNormalizer[]) objectCache
+ .getObject(URLNormalizer.X_POINT_ID + "_" + SCOPE_DEFAULT);
if (normalizers == null) {
normalizers = getURLNormalizers(SCOPE_DEFAULT);
}
}
-
+
loopCount = conf.getInt("urlnormalizer.loop.count", 1);
}
/**
- * Function returns an array of {@link URLNormalizer}s for a given scope,
- * with a specified order.
+ * Function returns an array of {@link URLNormalizer}s for a given scope, with
+ * a specified order.
*
* @param scope
* The scope to return the <code>Array</code> of
@@ -152,12 +175,13 @@ public final class URLNormalizers {
URLNormalizer[] getURLNormalizers(String scope) {
List<Extension> extensions = getExtensions(scope);
ObjectCache objectCache = ObjectCache.get(conf);
-
+
if (extensions == EMPTY_EXTENSION_LIST) {
return EMPTY_NORMALIZERS;
}
-
- List<URLNormalizer> normalizers = new Vector<URLNormalizer>(extensions.size());
+
+ List<URLNormalizer> normalizers = new Vector<URLNormalizer>(
+ extensions.size());
Iterator<Extension> it = extensions.iterator();
while (it.hasNext()) {
@@ -175,14 +199,13 @@ public final class URLNormalizers {
} catch (PluginRuntimeException e) {
e.printStackTrace();
LOG.warn("URLNormalizers:PluginRuntimeException when "
- + "initializing url normalizer plugin "
- + ext.getDescriptor().getPluginId()
- + " instance in getURLNormalizers "
- + "function: attempting to continue instantiating plugins");
+ + "initializing url normalizer plugin "
+ + ext.getDescriptor().getPluginId()
+ + " instance in getURLNormalizers "
+ + "function: attempting to continue instantiating plugins");
}
}
- return normalizers.toArray(new URLNormalizer[normalizers
- .size()]);
+ return normalizers.toArray(new URLNormalizer[normalizers.size()]);
}
/**
@@ -197,9 +220,8 @@ public final class URLNormalizers {
@SuppressWarnings("unchecked")
private List<Extension> getExtensions(String scope) {
ObjectCache objectCache = ObjectCache.get(conf);
- List<Extension> extensions =
- (List<Extension>) objectCache.getObject(URLNormalizer.X_POINT_ID + "_x_"
- + scope);
+ List<Extension> extensions = (List<Extension>) objectCache
+ .getObject(URLNormalizer.X_POINT_ID + "_x_" + scope);
// Just compare the reference:
// if this is the empty list, we know we will find no extension.
@@ -210,11 +232,13 @@ public final class URLNormalizers {
if (extensions == null) {
extensions = findExtensions(scope);
if (extensions != null) {
- objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, extensions);
+ objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope,
+ extensions);
} else {
// Put the empty extension list into cache
// to remember we don't know any related extension.
- objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope, EMPTY_EXTENSION_LIST);
+ objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope,
+ EMPTY_EXTENSION_LIST);
extensions = EMPTY_EXTENSION_LIST;
}
}
@@ -234,7 +258,8 @@ public final class URLNormalizers {
String[] orders = null;
String orderlist = conf.get("urlnormalizer.order." + scope);
- if (orderlist == null) orderlist = conf.get("urlnormalizer.order");
+ if (orderlist == null)
+ orderlist = conf.get("urlnormalizer.order");
if (orderlist != null && !orderlist.trim().equals("")) {
orders = orderlist.trim().split("\\s+");
}
@@ -272,13 +297,17 @@ public final class URLNormalizers {
/**
* Normalize
- * @param urlString The URL string to normalize.
- * @param scope The given scope.
+ *
+ * @param urlString
+ * The URL string to normalize.
+ * @param scope
+ * The given scope.
* @return A normalized String, using the given <code>scope</code>
- * @throws MalformedURLException If the given URL string is malformed.
+ * @throws MalformedURLException
+ * If the given URL string is malformed.
*/
public String normalize(String urlString, String scope)
- throws MalformedURLException {
+ throws MalformedURLException {
// optionally loop several times, and break if no further changes
String initialString = urlString;
for (int k = 0; k < loopCount; k++) {
@@ -287,7 +316,8 @@ public final class URLNormalizers {
return null;
urlString = this.normalizers[i].normalize(urlString, scope);
}
- if (initialString.equals(urlString)) break;
+ if (initialString.equals(urlString))
+ break;
initialString = urlString;
}
return urlString;
Modified: nutch/trunk/src/java/org/apache/nutch/net/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/package-info.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
* and {@link org.apache.nutch.net.URLNormalizer normalizers}.
*/
package org.apache.nutch.net;
+
Modified: nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java Thu Jan 29 05:38:59 2015
@@ -26,15 +26,15 @@ import java.text.ParseException;
/**
* class to handle HTTP dates.
- *
+ *
* Modified from FastHttpDateFormat.java in jakarta-tomcat.
- *
+ *
* @author John Xing
*/
public class HttpDateFormat {
- protected static SimpleDateFormat format =
- new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);
+ protected static SimpleDateFormat format = new SimpleDateFormat(
+ "EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);
/**
* HTTP date uses TimeZone GMT
@@ -43,29 +43,29 @@ public class HttpDateFormat {
format.setTimeZone(TimeZone.getTimeZone("GMT"));
}
- //HttpDate (long t) {
- //}
+ // HttpDate (long t) {
+ // }
- //HttpDate (String s) {
- //}
+ // HttpDate (String s) {
+ // }
-// /**
-// * Get the current date in HTTP format.
-// */
-// public static String getCurrentDate() {
-//
-// long now = System.currentTimeMillis();
-// if ((now - currentDateGenerated) > 1000) {
-// synchronized (format) {
-// if ((now - currentDateGenerated) > 1000) {
-// currentDateGenerated = now;
-// currentDate = format.format(new Date(now));
-// }
-// }
-// }
-// return currentDate;
-//
-// }
+ // /**
+ // * Get the current date in HTTP format.
+ // */
+ // public static String getCurrentDate() {
+ //
+ // long now = System.currentTimeMillis();
+ // if ((now - currentDateGenerated) > 1000) {
+ // synchronized (format) {
+ // if ((now - currentDateGenerated) > 1000) {
+ // currentDateGenerated = now;
+ // currentDate = format.format(new Date(now));
+ // }
+ // }
+ // }
+ // return currentDate;
+ //
+ // }
/**
* Get the HTTP format of the specified date.
Modified: nutch/trunk/src/java/org/apache/nutch/net/protocols/ProtocolException.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/protocols/ProtocolException.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/protocols/ProtocolException.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/protocols/ProtocolException.java Thu Jan 29 05:38:59 2015
@@ -21,13 +21,13 @@ import java.io.Serializable;
/**
* Base exception for all protocol handlers
+ *
* @deprecated Use {@link org.apache.nutch.protocol.ProtocolException} instead.
*/
@Deprecated
@SuppressWarnings("serial")
public class ProtocolException extends Exception implements Serializable {
-
public ProtocolException() {
super();
}
Modified: nutch/trunk/src/java/org/apache/nutch/net/protocols/Response.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/protocols/Response.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/protocols/Response.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/protocols/Response.java Thu Jan 29 05:38:59 2015
@@ -23,12 +23,11 @@ import java.net.URL;
import org.apache.nutch.metadata.HttpHeaders;
import org.apache.nutch.metadata.Metadata;
-
/**
- * A response interface. Makes all protocols model HTTP.
+ * A response interface. Makes all protocols model HTTP.
*/
public interface Response extends HttpHeaders {
-
+
/** Returns the URL used to retrieve this response. */
public URL getUrl();
@@ -40,7 +39,7 @@ public interface Response extends HttpHe
/** Returns all the headers. */
public Metadata getHeaders();
-
+
/** Returns the full content of the response. */
public byte[] getContent();
Modified: nutch/trunk/src/java/org/apache/nutch/net/protocols/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/protocols/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/net/protocols/package-info.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/net/protocols/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
* interface, sea also {@link org.apache.nutch.protocol}.
*/
package org.apache.nutch.net.protocols;
+
Modified: nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java Thu Jan 29 05:38:59 2015
@@ -24,8 +24,8 @@ import java.util.Properties;
import org.apache.nutch.metadata.Metadata;
/**
- * This class holds the information about HTML "meta" tags extracted from
- * a page. Some special tags have convenience methods for easy checking.
+ * This class holds the information about HTML "meta" tags extracted from a
+ * page. Some special tags have convenience methods for easy checking.
*/
public class HTMLMetaTags {
private boolean noIndex = false;
@@ -45,7 +45,7 @@ public class HTMLMetaTags {
private Metadata generalTags = new Metadata();
private Properties httpEquivTags = new Properties();
-
+
/**
* Sets all boolean values to <code>false</code>. Clears all other tags.
*/
@@ -156,8 +156,8 @@ public class HTMLMetaTags {
}
/**
- * A convenience method. Returns the current value of <code>refreshTime</code>.
- * The value may be invalid if {@link #getRefresh()}returns
+ * A convenience method. Returns the current value of <code>refreshTime</code>
+ * . The value may be invalid if {@link #getRefresh()}returns
* <code>false</code>.
*/
public int getRefreshTime() {
@@ -179,16 +179,12 @@ public class HTMLMetaTags {
public Properties getHttpEquivTags() {
return httpEquivTags;
}
-
+
public String toString() {
StringBuffer sb = new StringBuffer();
- sb.append("base=" + baseHref
- + ", noCache=" + noCache
- + ", noFollow=" + noFollow
- + ", noIndex=" + noIndex
- + ", refresh=" + refresh
- + ", refreshHref=" + refreshHref + "\n"
- );
+ sb.append("base=" + baseHref + ", noCache=" + noCache + ", noFollow="
+ + noFollow + ", noIndex=" + noIndex + ", refresh=" + refresh
+ + ", refreshHref=" + refreshHref + "\n");
sb.append(" * general tags:\n");
String[] names = generalTags.names();
for (String name : names) {
@@ -199,7 +195,7 @@ public class HTMLMetaTags {
Iterator<Object> it = httpEquivTags.keySet().iterator();
it = httpEquivTags.keySet().iterator();
while (it.hasNext()) {
- String key = (String)it.next();
+ String key = (String) it.next();
sb.append(" - " + key + "\t=\t" + httpEquivTags.get(key) + "\n");
}
return sb.toString();
Modified: nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilter.java Thu Jan 29 05:38:59 2015
@@ -27,16 +27,19 @@ import org.apache.hadoop.conf.Configurab
import org.apache.nutch.plugin.Pluggable;
import org.apache.nutch.protocol.Content;
-
-/** Extension point for DOM-based HTML parsers. Permits one to add additional
- * metadata to HTML parses. All plugins found which implement this extension
+/**
+ * Extension point for DOM-based HTML parsers. Permits one to add additional
+ * metadata to HTML parses. All plugins found which implement this extension
* point are run sequentially on the parse.
*/
public interface HtmlParseFilter extends Pluggable, Configurable {
/** The name of the extension point. */
final static String X_POINT_ID = HtmlParseFilter.class.getName();
- /** Adds metadata or otherwise modifies a parse of HTML content, given
- * the DOM tree of a page. */
- ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc);
+ /**
+ * Adds metadata or otherwise modifies a parse of HTML content, given the DOM
+ * tree of a page.
+ */
+ ParseResult filter(Content content, ParseResult parseResult,
+ HTMLMetaTags metaTags, DocumentFragment doc);
}
Modified: nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/HtmlParseFilters.java Thu Jan 29 05:38:59 2015
@@ -23,11 +23,11 @@ import org.apache.hadoop.conf.Configurat
import org.w3c.dom.DocumentFragment;
-/** Creates and caches {@link HtmlParseFilter} implementing plugins.*/
+/** Creates and caches {@link HtmlParseFilter} implementing plugins. */
public class HtmlParseFilters {
private HtmlParseFilter[] htmlParseFilters;
-
+
public static final String HTMLPARSEFILTER_ORDER = "htmlparsefilter.order";
public HtmlParseFilters(Configuration conf) {
@@ -37,13 +37,14 @@ public class HtmlParseFilters {
}
/** Run all defined filters. */
- public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
+ public ParseResult filter(Content content, ParseResult parseResult,
+ HTMLMetaTags metaTags, DocumentFragment doc) {
// loop on each filter
- for (int i = 0 ; i < this.htmlParseFilters.length; i++) {
+ for (int i = 0; i < this.htmlParseFilters.length; i++) {
// call filter interface
- parseResult =
- htmlParseFilters[i].filter(content, parseResult, metaTags, doc);
+ parseResult = htmlParseFilters[i].filter(content, parseResult, metaTags,
+ doc);
// any failure on parse obj, return
if (!parseResult.isSuccess()) {
Modified: nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java Thu Jan 29 05:38:59 2015
@@ -30,103 +30,102 @@ import org.apache.hadoop.io.Writable;
/* An outgoing link from a page. */
public class Outlink implements Writable {
- private String toUrl;
- private String anchor;
- private MapWritable md;
-
- public Outlink() {
- }
-
- public Outlink(String toUrl, String anchor) throws MalformedURLException {
- this.toUrl = toUrl;
- if (anchor == null)
- anchor = "";
- this.anchor = anchor;
- md = null;
- }
-
- public void readFields(DataInput in) throws IOException {
- toUrl = Text.readString(in);
- anchor = Text.readString(in);
- boolean hasMD = in.readBoolean();
- if (hasMD) {
- md = new org.apache.hadoop.io.MapWritable();
- md.readFields(in);
- } else
- md = null;
- }
-
- /** Skips over one Outlink in the input. */
- public static void skip(DataInput in) throws IOException {
- Text.skip(in); // skip toUrl
- Text.skip(in); // skip anchor
- boolean hasMD = in.readBoolean();
- if (hasMD) {
- MapWritable metadata = new org.apache.hadoop.io.MapWritable();
- metadata.readFields(in);
- ;
- }
- }
-
- public void write(DataOutput out) throws IOException {
- Text.writeString(out, toUrl);
- Text.writeString(out, anchor);
- if (md != null && md.size() > 0) {
- out.writeBoolean(true);
- md.write(out);
- } else {
- out.writeBoolean(false);
- }
- }
-
- public static Outlink read(DataInput in) throws IOException {
- Outlink outlink = new Outlink();
- outlink.readFields(in);
- return outlink;
- }
-
- public String getToUrl() {
- return toUrl;
- }
-
- public void setUrl(String toUrl) {
- this.toUrl = toUrl;
- }
-
- public String getAnchor() {
- return anchor;
- }
-
- public MapWritable getMetadata() {
- return md;
- }
-
- public void setMetadata(MapWritable md) {
- this.md = md;
- }
-
- public boolean equals(Object o) {
- if (!(o instanceof Outlink))
- return false;
- Outlink other = (Outlink) o;
- return this.toUrl.equals(other.toUrl)
- && this.anchor.equals(other.anchor);
- }
-
- public String toString() {
- StringBuffer repr = new StringBuffer("toUrl: ");
- repr.append(toUrl);
- repr.append(" anchor: ");
- repr.append(anchor);
- if (md != null && !md.isEmpty()) {
- for (Entry<Writable, Writable> e : md.entrySet()) {
- repr.append(" ");
- repr.append(e.getKey());
- repr.append(": ");
- repr.append(e.getValue());
- }
- }
- return repr.toString();
+ private String toUrl;
+ private String anchor;
+ private MapWritable md;
+
+ public Outlink() {
+ }
+
+ public Outlink(String toUrl, String anchor) throws MalformedURLException {
+ this.toUrl = toUrl;
+ if (anchor == null)
+ anchor = "";
+ this.anchor = anchor;
+ md = null;
+ }
+
+ public void readFields(DataInput in) throws IOException {
+ toUrl = Text.readString(in);
+ anchor = Text.readString(in);
+ boolean hasMD = in.readBoolean();
+ if (hasMD) {
+ md = new org.apache.hadoop.io.MapWritable();
+ md.readFields(in);
+ } else
+ md = null;
+ }
+
+ /** Skips over one Outlink in the input. */
+ public static void skip(DataInput in) throws IOException {
+ Text.skip(in); // skip toUrl
+ Text.skip(in); // skip anchor
+ boolean hasMD = in.readBoolean();
+ if (hasMD) {
+ MapWritable metadata = new org.apache.hadoop.io.MapWritable();
+ metadata.readFields(in);
+ ;
+ }
+ }
+
+ public void write(DataOutput out) throws IOException {
+ Text.writeString(out, toUrl);
+ Text.writeString(out, anchor);
+ if (md != null && md.size() > 0) {
+ out.writeBoolean(true);
+ md.write(out);
+ } else {
+ out.writeBoolean(false);
+ }
+ }
+
+ public static Outlink read(DataInput in) throws IOException {
+ Outlink outlink = new Outlink();
+ outlink.readFields(in);
+ return outlink;
+ }
+
+ public String getToUrl() {
+ return toUrl;
+ }
+
+ public void setUrl(String toUrl) {
+ this.toUrl = toUrl;
+ }
+
+ public String getAnchor() {
+ return anchor;
+ }
+
+ public MapWritable getMetadata() {
+ return md;
+ }
+
+ public void setMetadata(MapWritable md) {
+ this.md = md;
+ }
+
+ public boolean equals(Object o) {
+ if (!(o instanceof Outlink))
+ return false;
+ Outlink other = (Outlink) o;
+ return this.toUrl.equals(other.toUrl) && this.anchor.equals(other.anchor);
+ }
+
+ public String toString() {
+ StringBuffer repr = new StringBuffer("toUrl: ");
+ repr.append(toUrl);
+ repr.append(" anchor: ");
+ repr.append(anchor);
+ if (md != null && !md.isEmpty()) {
+ for (Entry<Writable, Writable> e : md.entrySet()) {
+ repr.append(" ");
+ repr.append(e.getKey());
+ repr.append(": ");
+ repr.append(e.getValue());
+ }
}
+ return repr.toString();
+ }
}
Modified: nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/OutlinkExtractor.java Thu Jan 29 05:38:59 2015
@@ -34,8 +34,8 @@ import org.apache.oro.text.regex.Perl5Co
import org.apache.oro.text.regex.Perl5Matcher;
/**
- * Extractor to extract {@link org.apache.nutch.parse.Outlink}s
- * / URLs from plain text using Regular Expressions.
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from
+ * plain text using Regular Expressions.
*
* @see <a
* href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison
@@ -48,23 +48,26 @@ import org.apache.oro.text.regex.Perl5Ma
* @since 0.7
*/
public class OutlinkExtractor {
- private static final Logger LOG = LoggerFactory.getLogger(OutlinkExtractor.class);
+ private static final Logger LOG = LoggerFactory
+ .getLogger(OutlinkExtractor.class);
/**
* Regex pattern to get URLs within a plain text.
*
* @see <a
* href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
+
* </a>
*/
- private static final String URL_PATTERN =
- "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
+ private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
/**
- * Extracts <code>Outlink</code> from given plain text.
- * Applying this method to non-plain-text can result in extremely lengthy
- * runtimes for parasitic cases (postscript is a known example).
- * @param plainText the plain text from wich URLs should be extracted.
+ * Extracts <code>Outlink</code> from given plain text. Applying this method
+ * to non-plain-text can result in extremely lengthy runtimes for parasitic
+ * cases (postscript is a known example).
+ *
+ * @param plainText
+ * the plain text from wich URLs should be extracted.
*
* @return Array of <code>Outlink</code>s within found in plainText
*/
@@ -73,15 +76,18 @@ public class OutlinkExtractor {
}
/**
- * Extracts <code>Outlink</code> from given plain text and adds anchor
- * to the extracted <code>Outlink</code>s
+ * Extracts <code>Outlink</code> from given plain text and adds anchor to the
+ * extracted <code>Outlink</code>s
*
- * @param plainText the plain text from wich URLs should be extracted.
- * @param anchor the anchor of the url
+ * @param plainText
+ * the plain text from wich URLs should be extracted.
+ * @param anchor
+ * the anchor of the url
*
* @return Array of <code>Outlink</code>s within found in plainText
*/
- public static Outlink[] getOutlinks(final String plainText, String anchor, Configuration conf) {
+ public static Outlink[] getOutlinks(final String plainText, String anchor,
+ Configuration conf) {
long start = System.currentTimeMillis();
final List<Outlink> outlinks = new ArrayList<Outlink>();
@@ -97,11 +103,11 @@ public class OutlinkExtractor {
MatchResult result;
String url;
- //loop the matches
+ // loop the matches
while (matcher.contains(input, pattern)) {
// if this is taking too long, stop matching
- // (SHOULD really check cpu time used so that heavily loaded systems
- // do not unnecessarily hit this limit.)
+ // (SHOULD really check cpu time used so that heavily loaded systems
+ // do not unnecessarily hit this limit.)
if (System.currentTimeMillis() - start >= 60000L) {
if (LOG.isWarnEnabled()) {
LOG.warn("Time limit exceeded for getOutLinks");
@@ -117,13 +123,16 @@ public class OutlinkExtractor {
}
}
} catch (Exception ex) {
- // if the matcher fails (perhaps a malformed URL) we just log it and move on
- if (LOG.isErrorEnabled()) { LOG.error("getOutlinks", ex); }
+ // if the matcher fails (perhaps a malformed URL) we just log it and move
+ // on
+ if (LOG.isErrorEnabled()) {
+ LOG.error("getOutlinks", ex);
+ }
}
final Outlink[] retval;
- //create array of the Outlinks
+ // create array of the Outlinks
if (outlinks != null && outlinks.size() > 0) {
retval = outlinks.toArray(new Outlink[0]);
} else {
@@ -132,7 +141,6 @@ public class OutlinkExtractor {
return retval;
}
-
/**
* Extracts outlinks from a plain text. <br />
@@ -162,7 +170,7 @@ public class OutlinkExtractor {
// url = re.getParen(0);
//
// if (LOG.isTraceEnabled()) {
- // LOG.trace("Extracted url: " + url);
+ // LOG.trace("Extracted url: " + url);
// }
//
// try {
@@ -192,9 +200,8 @@ public class OutlinkExtractor {
}
/**
- * Extracts outlinks from a plain text.
- * </p>
- * This Method takes the JDK5 Regexp API.
+ * Extracts outlinks from a plain text. <p> This method uses the JDK5 Regexp
+ * API. </p>
*
* @param plainText
*
@@ -243,5 +250,5 @@ public class OutlinkExtractor {
//
// return retval;
}
-
+
}
Modified: nutch/trunk/src/java/org/apache/nutch/parse/Parse.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/Parse.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/Parse.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/Parse.java Thu Jan 29 05:38:59 2015
@@ -17,18 +17,22 @@
package org.apache.nutch.parse;
-/** The result of parsing a page's raw content.
+/**
+ * The result of parsing a page's raw content.
+ *
* @see Parser#getParse(Content)
*/
public interface Parse {
-
- /** The textual content of the page. This is indexed, searched, and used when
- * generating snippets.*/
+
+ /**
+ * The textual content of the page. This is indexed, searched, and used when
+ * generating snippets.
+ */
String getText();
/** Other data extracted from the page. */
ParseData getData();
-
+
/** Indicates if the parse is coming from a url or a sub-url */
boolean isCanonical();
}
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseCallable.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseCallable.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseCallable.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseCallable.java Thu Jan 29 05:38:59 2015
@@ -24,7 +24,7 @@ import org.apache.nutch.protocol.Content
class ParseCallable implements Callable<ParseResult> {
private Parser p;
private Content content;
-
+
public ParseCallable(Parser p, Content content) {
this.p = p;
this.content = content;
@@ -33,5 +33,5 @@ class ParseCallable implements Callable<
@Override
public ParseResult call() throws Exception {
return p.getParse(content);
- }
+ }
}
\ No newline at end of file
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Thu Jan 29 05:38:59 2015
@@ -30,8 +30,9 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.util.NutchConfiguration;
-
-/** Data extracted from a page's content.
+/**
+ * Data extracted from a page's content.
+ *
* @see Parse#getData()
*/
public final class ParseData extends VersionedWritable {
@@ -45,19 +46,19 @@ public final class ParseData extends Ver
private Metadata parseMeta;
private ParseStatus status;
private byte version = VERSION;
-
+
public ParseData() {
contentMeta = new Metadata();
parseMeta = new Metadata();
}
public ParseData(ParseStatus status, String title, Outlink[] outlinks,
- Metadata contentMeta) {
+ Metadata contentMeta) {
this(status, title, outlinks, contentMeta, new Metadata());
}
-
+
public ParseData(ParseStatus status, String title, Outlink[] outlinks,
- Metadata contentMeta, Metadata parseMeta) {
+ Metadata contentMeta, Metadata parseMeta) {
this.status = status;
this.title = title;
this.outlinks = outlinks;
@@ -70,25 +71,34 @@ public final class ParseData extends Ver
//
/** The status of parsing the page. */
- public ParseStatus getStatus() { return status; }
-
+ public ParseStatus getStatus() {
+ return status;
+ }
+
/** The title of the page. */
- public String getTitle() { return title; }
+ public String getTitle() {
+ return title;
+ }
/** The outlinks of the page. */
- public Outlink[] getOutlinks() { return outlinks; }
+ public Outlink[] getOutlinks() {
+ return outlinks;
+ }
/** The original Metadata retrieved from content */
- public Metadata getContentMeta() { return contentMeta; }
+ public Metadata getContentMeta() {
+ return contentMeta;
+ }
/**
- * Other content properties.
- * This is the place to find format-specific properties.
- * Different parser implementations for different content types will populate
- * this differently.
+ * Other content properties. This is the place to find format-specific
+ * properties. Different parser implementations for different content types
+ * will populate this differently.
*/
- public Metadata getParseMeta() { return parseMeta; }
-
+ public Metadata getParseMeta() {
+ return parseMeta;
+ }
+
public void setParseMeta(Metadata parseMeta) {
this.parseMeta = parseMeta;
}
@@ -96,11 +106,12 @@ public final class ParseData extends Ver
public void setOutlinks(Outlink[] outlinks) {
this.outlinks = outlinks;
}
-
+
/**
- * Get a metadata single value.
- * This method first looks for the metadata value in the parse metadata. If no
- * value is found it the looks for the metadata in the content metadata.
+ * Get a metadata single value. This method first looks for the metadata value
+ * in the parse metadata. If no value is found it then looks for the metadata
+ * in the content metadata.
+ *
* @see #getContentMeta()
* @see #getParseMeta()
*/
@@ -111,12 +122,14 @@ public final class ParseData extends Ver
}
return value;
}
-
+
//
// Writable methods
//
- public byte getVersion() { return version; }
+ public byte getVersion() {
+ return version;
+ }
public final void readFields(DataInput in) throws IOException {
@@ -125,16 +138,16 @@ public final class ParseData extends Ver
if (version != VERSION)
throw new VersionMismatchException(VERSION, version);
status = ParseStatus.read(in);
- title = Text.readString(in); // read title
+ title = Text.readString(in); // read title
- int numOutlinks = in.readInt();
+ int numOutlinks = in.readInt();
outlinks = new Outlink[numOutlinks];
for (int i = 0; i < numOutlinks; i++) {
outlinks[i] = Outlink.read(in);
}
-
+
if (version < 3) {
- int propertyCount = in.readInt(); // read metadata
+ int propertyCount = in.readInt(); // read metadata
contentMeta.clear();
for (int i = 0; i < propertyCount; i++) {
contentMeta.add(Text.readString(in), Text.readString(in));
@@ -150,15 +163,15 @@ public final class ParseData extends Ver
}
public final void write(DataOutput out) throws IOException {
- out.writeByte(VERSION); // write version
- status.write(out); // write status
- Text.writeString(out, title); // write title
+ out.writeByte(VERSION); // write version
+ status.write(out); // write status
+ Text.writeString(out, title); // write title
- out.writeInt(outlinks.length); // write outlinks
+ out.writeInt(outlinks.length); // write outlinks
for (int i = 0; i < outlinks.length; i++) {
outlinks[i].write(out);
}
- contentMeta.write(out); // write content metadata
+ contentMeta.write(out); // write content metadata
parseMeta.write(out);
}
@@ -175,38 +188,36 @@ public final class ParseData extends Ver
public boolean equals(Object o) {
if (!(o instanceof ParseData))
return false;
- ParseData other = (ParseData)o;
- return
- this.status.equals(other.status) &&
- this.title.equals(other.title) &&
- Arrays.equals(this.outlinks, other.outlinks) &&
- this.contentMeta.equals(other.contentMeta) &&
- this.parseMeta.equals(other.parseMeta);
+ ParseData other = (ParseData) o;
+ return this.status.equals(other.status) && this.title.equals(other.title)
+ && Arrays.equals(this.outlinks, other.outlinks)
+ && this.contentMeta.equals(other.contentMeta)
+ && this.parseMeta.equals(other.parseMeta);
}
public String toString() {
StringBuffer buffer = new StringBuffer();
- buffer.append("Version: " + version + "\n" );
- buffer.append("Status: " + status + "\n" );
- buffer.append("Title: " + title + "\n" );
+ buffer.append("Version: " + version + "\n");
+ buffer.append("Status: " + status + "\n");
+ buffer.append("Title: " + title + "\n");
if (outlinks != null) {
- buffer.append("Outlinks: " + outlinks.length + "\n" );
+ buffer.append("Outlinks: " + outlinks.length + "\n");
for (int i = 0; i < outlinks.length; i++) {
buffer.append(" outlink: " + outlinks[i] + "\n");
}
}
- buffer.append("Content Metadata: " + contentMeta + "\n" );
- buffer.append("Parse Metadata: " + parseMeta + "\n" );
+ buffer.append("Content Metadata: " + contentMeta + "\n");
+ buffer.append("Parse Metadata: " + parseMeta + "\n");
return buffer.toString();
}
public static void main(String argv[]) throws Exception {
String usage = "ParseData (-local | -dfs <namenode:port>) recno segment";
-
+
if (argv.length < 3) {
System.out.println("usage:" + usage);
return;
@@ -214,13 +225,12 @@ public final class ParseData extends Ver
Options opts = new Options();
Configuration conf = NutchConfiguration.create();
-
- GenericOptionsParser parser =
- new GenericOptionsParser(conf, opts, argv);
-
+
+ GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);
+
String[] remainingArgs = parser.getRemainingArgs();
FileSystem fs = FileSystem.get(conf);
-
+
try {
int recno = Integer.parseInt(remainingArgs[0]);
String segment = remainingArgs[1];
Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java Thu Jan 29 05:38:59 2015
@@ -20,8 +20,9 @@ package org.apache.nutch.parse;
import java.io.*;
import org.apache.hadoop.io.*;
-
-/** The result of parsing a page's raw content.
+/**
+ * The result of parsing a page's raw content.
+ *
* @see Parser#getParse(Content)
*/
public class ParseImpl implements Parse, Writable {
@@ -29,7 +30,8 @@ public class ParseImpl implements Parse,
private ParseData data;
private boolean isCanonical;
- public ParseImpl() {}
+ public ParseImpl() {
+ }
public ParseImpl(Parse parse) {
this(new ParseText(parse.getText()), parse.getData(), true);
@@ -38,7 +40,7 @@ public class ParseImpl implements Parse,
public ParseImpl(String text, ParseData data) {
this(new ParseText(text), data, true);
}
-
+
public ParseImpl(ParseText text, ParseData data) {
this(text, data, true);
}
@@ -49,12 +51,18 @@ public class ParseImpl implements Parse,
this.isCanonical = isCanonical;
}
- public String getText() { return text.getText(); }
+ public String getText() {
+ return text.getText();
+ }
- public ParseData getData() { return data; }
+ public ParseData getData() {
+ return data;
+ }
+
+ public boolean isCanonical() {
+ return isCanonical;
+ }
- public boolean isCanonical() { return isCanonical; }
-
public final void write(DataOutput out) throws IOException {
out.writeBoolean(isCanonical);
text.write(out);