You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by do...@apache.org on 2007/07/11 12:54:39 UTC
svn commit: r555237 - in /lucene/nutch/trunk: ./
src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/net/
src/java/org/apache/nutch/parse/
src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/
src/plugin/parse-ext/src/java/org/apache/nutc...
Author: dogacan
Date: Wed Jul 11 03:54:37 2007
New Revision: 555237
URL: http://svn.apache.org/viewvc?view=rev&rev=555237
Log:
NUTCH-505 - Outlink urls should be validated.
Added:
lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Wed Jul 11 03:54:37 2007
@@ -81,6 +81,8 @@
26. NUTCH-503 - Generator exits incorrectly for small fetchlists.
(Vishal Shah via dogacan)
+27. NUTCH-505 - Outlink urls should be validated. (dogacan)
+
Release 0.9 - 2007-04-02
1. Changed log4j confiquration to log to stdout on commandline
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutput.java Wed Jul 11 03:54:37 2007
@@ -23,15 +23,12 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.parse.*;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configurable;
/* An entry in the fetcher's output. */
-public final class FetcherOutput implements Writable, Configurable {
+public final class FetcherOutput implements Writable {
private CrawlDatum crawlDatum;
private Content content;
private ParseImpl parse;
- private Configuration conf;
public FetcherOutput() {}
@@ -45,7 +42,7 @@
public final void readFields(DataInput in) throws IOException {
this.crawlDatum = CrawlDatum.read(in);
this.content = in.readBoolean() ? Content.read(in) : null;
- this.parse = in.readBoolean() ? ParseImpl.read(in, this.conf) : null;
+ this.parse = in.readBoolean() ? ParseImpl.read(in) : null;
}
public final void write(DataOutput out) throws IOException {
@@ -79,14 +76,6 @@
StringBuffer buffer = new StringBuffer();
buffer.append("CrawlDatum: " + crawlDatum+"\n" );
return buffer.toString();
- }
-
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- public Configuration getConf() {
- return this.conf;
}
}
Added: lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java?view=auto&rev=555237
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/net/UrlValidator.java Wed Jul 11 03:54:37 2007
@@ -0,0 +1,377 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net;
+
+import org.apache.oro.text.perl.Perl5Util;
+
+/**
+ * <p>Validates URLs.</p>
+ *
+ * <p>Originally based on a PHP script by Debbie Dyer, validation.php v1.2b, Date: 03/07/02,
+ * http://javascript.internet.com. However, this validation now bears little resemblance
+ * to the PHP original.</p>
+ * <pre>
+ * Example of usage:
+ * UrlValidator urlValidator = UrlValidator.get();
+ * if (urlValidator.isValid("ftp://foo.bar.com/")) {
+ * System.out.println("url is valid");
+ * } else {
+ * System.out.println("url is invalid");
+ * }
+ *
+ * prints out "url is valid"
+ * </pre>
+ *
+ * <p>Based on UrlValidator code from Apache commons-validator.</p>
+ *
+ * @see
+ * <a href='http://www.ietf.org/rfc/rfc2396.txt' >
+ * Uniform Resource Identifiers (URI): Generic Syntax
+ * </a>
+ *
+ */
+public class UrlValidator {
+
+ private static final String ALPHA_CHARS = "a-zA-Z";
+
+ private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "\\d";
+
+ private static final String SPECIAL_CHARS = ";/@&=,.?:+$";
+
+ private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]"; // any char that is neither whitespace nor in SPECIAL_CHARS
+
+ private static final String SCHEME_CHARS = ALPHA_CHARS; // letters only: digits and the +, - and . characters are dropped for now
+
+ // Characters accepted in the host part: letters, digits, '-' and '.'
+ private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\.";
+
+ private static final String ATOM = VALID_CHARS + '+'; // one or more VALID_CHARS, i.e. one dot-free domain segment
+
+ /**
+ * Splits a URL into its components. This expression is derived/taken from the
+ * BNF for URI (RFC2396). All patterns in this class are written with surrounding
+ * slashes because they are handed to Perl5Util.match as Perl-style expressions.
+ */
+ private static final String URL_PATTERN =
+ "/^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?/";
+ // capture groups used below: 2 = scheme, 4 = authority, 5 = path, 7 = query (9 holds the fragment)
+
+ /**
+ * Group of URL_PATTERN holding the scheme/protocol without the colon
+ * (e.g. http, ftp, file).
+ */
+ private static final int PARSE_URL_SCHEME = 2;
+
+ /**
+ * Group of URL_PATTERN holding the authority, which includes hostname/ip
+ * and port number.
+ */
+ private static final int PARSE_URL_AUTHORITY = 4;
+
+ private static final int PARSE_URL_PATH = 5;
+
+ private static final int PARSE_URL_QUERY = 7;
+
+ /**
+ * Protocol (e.g. http, ftp, https). Note that the expression is anchored at the
+ * start only and contains a single character class, so it tests just the first
+ * character of the scheme.
+ */
+ private static final String SCHEME_PATTERN = "/^[" + SCHEME_CHARS + "]/";
+
+ private static final String AUTHORITY_PATTERN =
+ "/^([" + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?/";
+ // capture groups: 1 = host name or IP, 2 = colon plus port digits, 3 = anything left over
+
+ private static final int PARSE_AUTHORITY_HOST_IP = 1;
+
+ private static final int PARSE_AUTHORITY_PORT = 2;
+
+ /**
+ * Group of AUTHORITY_PATTERN holding whatever follows host and port.
+ * Should always be empty for a valid authority.
+ */
+ private static final int PARSE_AUTHORITY_EXTRA = 3;
+
+ private static final String PATH_PATTERN = "/^(/[-\\w:@&?=+,.!/~*'%$_;]*)?$/"; // empty, or a slash followed by the listed path characters
+
+ private static final String QUERY_PATTERN = "/^(.*)$/";
+
+ private static final String LEGAL_ASCII_PATTERN = "/^[\\000-\\177]+$/"; // one or more characters in the octal range 000-177 (7-bit ASCII)
+
+ private static final String IP_V4_DOMAIN_PATTERN =
+ "/^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$/";
+
+ private static final String DOMAIN_PATTERN =
+ "/^" + ATOM + "(\\." + ATOM + ")*$/";
+
+ private static final String PORT_PATTERN = "/^:(\\d{1,5})$/"; // a colon followed by 1 to 5 digits
+
+ private static final String ATOM_PATTERN = "/(" + ATOM + ")/"; // unanchored: captures the first atom found in the input
+
+ private static final String ALPHA_PATTERN = "/^[" + ALPHA_CHARS + "]/";
+
+ private static final UrlValidator VALIDATOR = new UrlValidator(); // the single shared instance handed out by get()
+
+ private UrlValidator() { // private: the only instance is the VALIDATOR constant, obtained through get()
+ }
+
+ public static UrlValidator get() { // returns the shared validator instance; no new object is created per call
+ return VALIDATOR;
+ }
+
+ /**
+ * <p>Checks if a field has a valid url address.</p>
+ *
+ * <p>The value must consist solely of 7-bit ASCII characters (an empty string
+ * fails this test) and must match URL_PATTERN. Its scheme, authority, path and
+ * query components are then validated in that order by isValidScheme,
+ * isValidAuthority, isValidPath and isValidQuery; the first failing check ends
+ * the validation. The fragment component is not inspected.</p>
+ *
+ * <p>NOTE(review): scheme and authority are effectively mandatory, because the
+ * helpers reject <code>null</code> and an absent component is presumably
+ * reported as <code>null</code> by Perl5Util.group - confirm against ORO.</p>
+ *
+ * @param value The value validation is being performed on. A <code>null</code>
+ * value is considered invalid.
+ * @return true if the url is valid.
+ */
+ public boolean isValid(String value) {
+ if (value == null) {
+ return false;
+ }
+
+ Perl5Util matchUrlPat = new Perl5Util();
+ Perl5Util matchAsciiPat = new Perl5Util();
+
+ if (!matchAsciiPat.match(LEGAL_ASCII_PATTERN, value)) {
+ return false;
+ }
+
+ // Check the overall URL structure; the groups captured here feed the checks below
+ if (!matchUrlPat.match(URL_PATTERN, value)) {
+ return false;
+ }
+
+ if (!isValidScheme(matchUrlPat.group(PARSE_URL_SCHEME))) {
+ return false;
+ }
+
+ if (!isValidAuthority(matchUrlPat.group(PARSE_URL_AUTHORITY))) {
+ return false;
+ }
+
+ if (!isValidPath(matchUrlPat.group(PARSE_URL_PATH))) {
+ return false;
+ }
+
+ if (!isValidQuery(matchUrlPat.group(PARSE_URL_QUERY))) {
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * Validates the scheme against SCHEME_PATTERN. There is no list of allowed
+ * schemes in this class: because SCHEME_PATTERN only tests the first character,
+ * any scheme that starts with an ASCII letter is accepted.
+ * @param scheme The scheme to validate. A <code>null</code> value is considered
+ * invalid.
+ * @return true if valid.
+ */
+ protected boolean isValidScheme(String scheme) {
+ if (scheme == null) {
+ return false;
+ }
+
+ Perl5Util schemeMatcher = new Perl5Util();
+ if (!schemeMatcher.match(SCHEME_PATTERN, scheme)) {
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * Returns true if the authority is properly formatted. An authority is the combination
+ * of hostname and port. A <code>null</code> authority value is considered invalid.
+ *
+ * <p>The host part must be either a dotted-quad IPv4 address whose four segments are
+ * each at most 255, or a domain name made of at least two dot-separated segments whose
+ * last segment (the top level) is 2 to 4 characters long and starts with a letter;
+ * longer top-level names are therefore rejected. A port, when present, must be a colon
+ * followed by 1 to 5 digits (its numeric range is not checked), and nothing may follow
+ * the host and port.</p>
+ *
+ * @param authority Authority value to validate.
+ * @return true if authority (hostname and port) is valid.
+ */
+ protected boolean isValidAuthority(String authority) {
+ if (authority == null) {
+ return false;
+ }
+
+ Perl5Util authorityMatcher = new Perl5Util();
+ Perl5Util matchIPV4Pat = new Perl5Util();
+
+ if (!authorityMatcher.match(AUTHORITY_PATTERN, authority)) {
+ return false;
+ }
+
+ boolean ipV4Address = false;
+ boolean hostname = false;
+ // decide whether the host part is an IPv4 address or a hostname
+ String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
+ ipV4Address = matchIPV4Pat.match(IP_V4_DOMAIN_PATTERN, hostIP);
+
+ if (ipV4Address) {
+ // IPv4 address: each of the four captured segments must not exceed 255
+ for (int i = 1; i <= 4; i++) {
+ String ipSegment = matchIPV4Pat.group(i);
+ if (ipSegment == null || ipSegment.length() <= 0) {
+ return false;
+ }
+
+ try {
+ if (Integer.parseInt(ipSegment) > 255) {
+ return false;
+ }
+ } catch(NumberFormatException e) {
+ return false;
+ }
+
+ }
+ } else {
+ // not an IPv4 address: the host must then be a dot-separated domain name
+ Perl5Util domainMatcher = new Perl5Util();
+ hostname = domainMatcher.match(DOMAIN_PATTERN, hostIP);
+ }
+
+ // for a hostname, split it into segments and check the rightmost (top-level) one
+ if (hostname) {
+ // LOW-TECH FIX FOR VALIDATOR-202
+ // TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203
+ char[] chars = hostIP.toCharArray();
+ int size = 1; // number of segments = number of dots + 1
+ for(int i=0; i<chars.length; i++) {
+ if(chars[i] == '.') {
+ size++;
+ }
+ }
+ String[] domainSegment = new String[size];
+ boolean match = true;
+ int segCount = 0;
+ int segLen = 0;
+ Perl5Util atomMatcher = new Perl5Util();
+
+ while (match) {
+ match = atomMatcher.match(ATOM_PATTERN, hostIP);
+ if (match) {
+ domainSegment[segCount] = atomMatcher.group(1);
+ segLen = domainSegment[segCount].length() + 1; // +1 also skips the dot after the segment
+ hostIP = (segLen >= hostIP.length()) ? ""
+ : hostIP.substring(segLen);
+ segCount++;
+ }
+ }
+ String topLevel = domainSegment[segCount - 1];
+ if (topLevel.length() < 2 || topLevel.length() > 4) {
+ return false;
+ }
+
+ // The first character of the top-level segment must be a letter
+ Perl5Util alphaMatcher = new Perl5Util();
+ if (!alphaMatcher.match(ALPHA_PATTERN, topLevel.substring(0, 1))) {
+ return false;
+ }
+
+ // Make sure there is at least one segment in front of the top-level one.
+ if (segCount < 2) {
+ return false;
+ }
+ }
+
+ if (!hostname && !ipV4Address) {
+ return false;
+ }
+
+ String port = authorityMatcher.group(PARSE_AUTHORITY_PORT); // includes the leading colon when present
+ if (port != null) {
+ Perl5Util portMatcher = new Perl5Util();
+ if (!portMatcher.match(PORT_PATTERN, port)) {
+ return false;
+ }
+ }
+
+ String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
+ if (!isBlankOrNull(extra)) {
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * <p>Checks whether the value is <code>null</code> or has no characters left
+ * once leading and trailing whitespace is trimmed.</p>
+ *
+ * @param value The value validation is being performed on.
+ * @return true if blank or null.
+ */
+ private boolean isBlankOrNull(String value) {
+ return ((value == null) || (value.trim().length() == 0));
+ }
+
+ /**
+ * Returns true if the path is valid. A <code>null</code> value is considered invalid.
+ *
+ * <p>The path must match PATH_PATTERN, which also accepts the empty path. When the
+ * path contains ".." it is additionally rejected if the number of ".." occurrences
+ * is at least the number of "/" occurrences minus the number of "//" occurrences
+ * minus one. All three counts come from countToken.</p>
+ *
+ * @param path Path value to validate.
+ * @return true if path is valid.
+ */
+ protected boolean isValidPath(String path) {
+ if (path == null) {
+ return false;
+ }
+
+ Perl5Util pathMatcher = new Perl5Util();
+
+ if (!pathMatcher.match(PATH_PATTERN, path)) {
+ return false;
+ }
+
+ int slash2Count = countToken("//", path);
+
+ int slashCount = countToken("/", path);
+ int dot2Count = countToken("..", path);
+ if (dot2Count > 0) {
+ if ((slashCount - slash2Count - 1) <= dot2Count) { // too many ".." for the number of path separators
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ /**
+ * Returns true if the query is null or it matches QUERY_PATTERN. A
+ * <code>null</code> query is accepted, so the query part is optional;
+ * QUERY_PATTERN itself is a catch-all expression rather than a strict
+ * query-string grammar.
+ * @param query Query value to validate.
+ * @return true if query is valid.
+ */
+ protected boolean isValidQuery(String query) {
+ if (query == null) {
+ return true;
+ }
+
+ Perl5Util queryMatcher = new Perl5Util();
+ return queryMatcher.match(QUERY_PATTERN, query);
+ }
+
+ /**
+ * Returns the number of times the token appears in the target. Overlapping
+ * occurrences are counted, because the search resumes one character after the
+ * start of each hit (for example ".." is found twice in "...").
+ *
+ * <p>NOTE(review): an empty token looks like it would never terminate, since
+ * indexOf keeps reporting a hit; callers in this class only pass non-empty
+ * tokens.</p>
+ *
+ * @param token Token value to be counted.
+ * @param target Target value to count tokens in.
+ * @return the number of tokens.
+ */
+ protected int countToken(String token, String target) {
+ int tokenIndex = 0;
+ int count = 0;
+ while (tokenIndex != -1) {
+ tokenIndex = target.indexOf(token, tokenIndex);
+ if (tokenIndex > -1) {
+ tokenIndex++; // advance by one character only, so overlapping hits are counted
+ count++;
+ }
+ }
+ return count;
+ }
+}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseData.java Wed Jul 11 03:54:37 2007
@@ -21,9 +21,8 @@
import java.util.*;
import org.apache.hadoop.io.*;
-import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.fs.*;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.util.NutchConfiguration;
@@ -32,7 +31,7 @@
/** Data extracted from a page's content.
* @see Parse#getData()
*/
-public final class ParseData extends VersionedWritable implements Configurable {
+public final class ParseData extends VersionedWritable {
public static final String DIR_NAME = "parse_data";
private final static byte VERSION = 5;
@@ -42,13 +41,8 @@
private Metadata contentMeta;
private Metadata parseMeta;
private ParseStatus status;
- private Configuration conf;
private byte version = VERSION;
- // TODO mb@media-style.com: should we really implement Configurable or should we add the
- // parameter Configuration to the default-constructor. NOTE: The test
- // TestWriteable instantiates ParseData with Class.newInstance() -> the default
- // constructor is called -> conf is null. The programmer which use this object may not forget to set the conf.
public ParseData() {}
public ParseData(ParseStatus status, String title, Outlink[] outlinks,
@@ -123,19 +117,11 @@
status = ParseStatus.read(in);
title = Text.readString(in); // read title
- int totalOutlinks = in.readInt(); // read outlinks
- int maxOutlinksPerPage = this.conf.getInt("db.max.outlinks.per.page", 100);
- int outlinksToRead = totalOutlinks;
- if (maxOutlinksPerPage >= 0) {
- outlinksToRead = Math.min(maxOutlinksPerPage, totalOutlinks);
- }
- outlinks = new Outlink[outlinksToRead];
- for (int i = 0; i < outlinksToRead; i++) {
+ int numOutlinks = in.readInt();
+ outlinks = new Outlink[numOutlinks];
+ for (int i = 0; i < numOutlinks; i++) {
outlinks[i] = Outlink.read(in);
}
- for (int i = outlinksToRead; i < totalOutlinks; i++) {
- Outlink.skip(in);
- }
if (version < 3) {
int propertyCount = in.readInt(); // read metadata
@@ -239,11 +225,4 @@
}
}
- public void setConf(Configuration conf) {
- this.conf = conf;
- }
-
- public Configuration getConf() {
- return this.conf;
- }
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseImpl.java Wed Jul 11 03:54:37 2007
@@ -19,18 +19,15 @@
import java.io.*;
import org.apache.hadoop.io.*;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configurable;
/** The result of parsing a page's raw content.
* @see Parser#getParse(Content)
*/
-public class ParseImpl implements Parse, Writable, Configurable {
+public class ParseImpl implements Parse, Writable {
private ParseText text;
private ParseData data;
private boolean isCanonical;
- private Configuration conf;
public ParseImpl() {}
@@ -70,25 +67,13 @@
text.readFields(in);
data = new ParseData();
- data.setConf(this.conf);
data.readFields(in);
}
- public static ParseImpl read(DataInput in, Configuration conf) throws IOException {
+ public static ParseImpl read(DataInput in) throws IOException {
ParseImpl parseImpl = new ParseImpl();
- parseImpl.setConf(conf);
parseImpl.readFields(in);
return parseImpl;
}
-
- public void setConf(Configuration conf) {
- this.conf = conf;
-
- }
-
- public Configuration getConf() {
- return this.conf;
- }
-
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Wed Jul 11 03:54:37 2007
@@ -45,7 +45,6 @@
public class ParseOutputFormat implements OutputFormat {
private static final Log LOG = LogFactory.getLog(ParseOutputFormat.class);
- private URLNormalizers urlNormalizers;
private URLFilters filters;
private ScoringFilters scfilters;
@@ -80,11 +79,12 @@
public RecordWriter getRecordWriter(FileSystem fs, JobConf job,
String name, Progressable progress) throws IOException {
- this.urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
this.filters = new URLFilters(job);
this.scfilters = new ScoringFilters(job);
+ final UrlValidator validator = UrlValidator.get();
final float interval = job.getFloat("db.default.fetch.interval", 30f);
final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
+ final int maxOutlinks = job.getInt("db.max.outlinks.per.page", 100);
Path text =
new Path(new Path(job.getOutputPath(), ParseText.DIR_NAME), name);
@@ -132,6 +132,7 @@
// collect outlinks for subsequent db update
Outlink[] links = parseData.getOutlinks();
+ int outlinksToStore = Math.min(maxOutlinks, links.length);
if (ignoreExternalLinks) {
try {
fromHost = new URL(fromUrl).getHost().toLowerCase();
@@ -142,29 +143,33 @@
fromHost = null;
}
- String[] toUrls = new String[links.length];
int validCount = 0;
- for (int i = 0; i < links.length; i++) {
+ CrawlDatum adjust = null;
+ List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>();
+ List<Outlink> outlinkList = new ArrayList<Outlink>();
+ for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
String toUrl = links[i].getToUrl();
+ if (!validator.isValid(toUrl)) {
+ continue;
+ }
try {
- toUrl = urlNormalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK); // normalize the url
+ // normalizing here is not necessary since outlinks
+ // are already normalized in Outlink's constructor
toUrl = filters.filter(toUrl); // filter the url
+ if (toUrl == null) {
+ continue;
+ }
} catch (Exception e) {
- toUrl = null;
+ continue;
}
+
// ignore links to self (or anchors within the page)
- if (fromUrl.equals(toUrl)) toUrl = null;
- if (toUrl != null) validCount++;
- toUrls[i] = toUrl;
- }
- CrawlDatum adjust = null;
- List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>();
- // compute score contributions and adjustment to the original score
- for (int i = 0; i < toUrls.length; i++) {
- if (toUrls[i] == null) continue;
+ if (fromUrl.equals(toUrl)) {
+ continue;
+ }
if (ignoreExternalLinks) {
try {
- toHost = new URL(toUrls[i]).getHost().toLowerCase();
+ toHost = new URL(toUrl).getHost().toLowerCase();
} catch (MalformedURLException e) {
toHost = null;
}
@@ -173,7 +178,7 @@
}
}
CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
- Text targetUrl = new Text(toUrls[i]);
+ Text targetUrl = new Text(toUrl);
try {
scfilters.initialScore(targetUrl, target);
} catch (ScoringFilterException e) {
@@ -183,8 +188,11 @@
}
targets.add(new SimpleEntry(targetUrl, target));
+ outlinkList.add(links[i]);
+ validCount++;
}
try {
+ // compute score contributions and adjustment to the original score
adjust = scfilters.distributeScoreToOutlinks((Text)key, parseData,
targets, null, links.length);
} catch (ScoringFilterException e) {
@@ -195,6 +203,10 @@
}
if (adjust != null) crawlOut.append(key, adjust);
+ Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
+ parseData = new ParseData(parseData.getStatus(), parseData.getTitle(),
+ filteredLinks, parseData.getContentMeta(),
+ parseData.getParseMeta());
dataOut.append(key, parseData);
if (!parse.isCanonical()) {
CrawlDatum datum = new CrawlDatum();
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseStatus.java Wed Jul 11 03:54:37 2007
@@ -258,7 +258,6 @@
public EmptyParseImpl(ParseStatus status, Configuration conf) {
data = new ParseData(status, "", new Outlink[0],
new Metadata(), new Metadata());
- data.setConf(conf);
}
public ParseData getData() {
Modified: lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-parsems/src/java/org/apache/nutch/parse/ms/MSBaseParser.java Wed Jul 11 03:54:37 2007
@@ -104,7 +104,6 @@
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
outlinks, content.getMetadata(),
metadata);
- parseData.setConf(this.conf);
return ParseResult.createParseResult(content.getUrl(),
new ParseImpl(text, parseData));
}
Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Wed Jul 11 03:54:37 2007
@@ -134,7 +134,6 @@
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
outlinks, content.getMetadata());
- parseData.setConf(this.conf);
return ParseResult.createParseResult(content.getUrl(),
new ParseImpl(text, parseData));
}
Modified: lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java Wed Jul 11 03:54:37 2007
@@ -213,7 +213,6 @@
}
ParseData parseData = new ParseData(status, title, outlinks,
content.getMetadata(), metadata);
- parseData.setConf(this.conf);
ParseResult parseResult = ParseResult.createParseResult(content.getUrl(),
new ParseImpl(text, parseData));
Modified: lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Wed Jul 11 03:54:37 2007
@@ -90,7 +90,6 @@
ParseData parseData = new ParseData(status, title, newlinks,
parse.getData().getContentMeta(),
parse.getData().getParseMeta());
- parseData.setConf(this.conf);
// replace original parse obj with new one
parseResult.put(content.getUrl(), new ParseText(text), parseData);
@@ -170,7 +169,6 @@
}
ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
c.getMetadata());
- pd.setConf(this.conf);
return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
}
Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/src/java/org/apache/nutch/parse/pdf/PdfParser.java Wed Jul 11 03:54:37 2007
@@ -153,7 +153,6 @@
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title,
outlinks, content.getMetadata(),
metadata);
- parseData.setConf(this.conf);
return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
// any filter?
//return HtmlParseFilters.filter(content, parse, root);
Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Wed Jul 11 03:54:37 2007
@@ -199,7 +199,6 @@
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
contentTitle.toString(), outlinks, content.getMetadata());
- parseData.setConf(this.conf);
return ParseResult.createParseResult(content.getUrl(), new ParseImpl(indexText.toString(), parseData));
}
Modified: lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-text/src/java/org/apache/nutch/parse/text/TextParser.java Wed Jul 11 03:54:37 2007
@@ -53,7 +53,6 @@
ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "",
OutlinkExtractor.getOutlinks(text, getConf()), content.getMetadata());
- parseData.setConf(this.conf);
return ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));
}
Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Wed Jul 11 03:54:37 2007
@@ -100,7 +100,6 @@
final ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS,
resultTitle, outlinks,
content.getMetadata());
- parseData.setConf(this.conf);
if (LOG.isTraceEnabled()) { LOG.trace("Zip file parsed sucessfully !!"); }
return ParseResult.createParseResult(content.getUrl(), new ParseImpl(resultText, parseData));
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java?view=diff&rev=555237&r1=555236&r2=555237
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/parse/TestParseData.java Wed Jul 11 03:54:37 2007
@@ -47,9 +47,8 @@
metaData.add("Charset", "UTF-8");
ParseData r = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
- r.setConf(conf);
- WritableTestUtils.testWritable(r, conf);
+ WritableTestUtils.testWritable(r, null);
}
public void testMaxOutlinks() throws Exception {
@@ -61,22 +60,7 @@
"Max Outlinks Title",
outlinks,
new Metadata());
- Configuration conf = NutchConfiguration.create();
- // No Outlinks
- conf.setInt("db.max.outlinks.per.page", 0);
- ParseData data = (ParseData) WritableTestUtils.writeRead(original, conf);
- assertEquals(0, data.getOutlinks().length);
- // Only 100 Outlinks
- conf.setInt("db.max.outlinks.per.page", 100);
- data = (ParseData) WritableTestUtils.writeRead(original, conf);
- assertEquals(100, data.getOutlinks().length);
- // 256 Outlinks
- conf.setInt("db.max.outlinks.per.page", 256);
- data = (ParseData) WritableTestUtils.writeRead(original, conf);
- assertEquals(outlinks.length, data.getOutlinks().length);
- // All Outlinks
- conf.setInt("db.max.outlinks.per.page", -1);
- data = (ParseData) WritableTestUtils.writeRead(original, conf);
+ ParseData data = (ParseData) WritableTestUtils.writeRead(original, null);
assertEquals(outlinks.length, data.getOutlinks().length);
}
}