You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by jn...@apache.org on 2014/05/15 10:10:09 UTC
svn commit: r1594812 [1/5] - in /nutch/branches/2.x: ./ ivy/ src/gora/
src/java/org/apache/nutch/api/ src/java/org/apache/nutch/crawl/
src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/host/
src/java/org/apache/nutch/indexer/ src/java/org/ap...
Author: jnioche
Date: Thu May 15 08:10:07 2014
New Revision: 1594812
URL: http://svn.apache.org/r1594812
Log:
NUTCH-1714 Nutch 2.x upgrade to Gora 0.4
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/ivy/ivy.xml
nutch/branches/2.x/src/gora/host.avsc
nutch/branches/2.x/src/gora/webpage.avsc
nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/Mark.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/ParseStatus.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/ProtocolStatus.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/WebPage.java
nutch/branches/2.x/src/java/org/apache/nutch/tools/DmozParser.java
nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java
nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java
nutch/branches/2.x/src/java/org/apache/nutch/util/TableUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/util/WebPageWritable.java
nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java
nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
nutch/branches/2.x/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java
nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
nutch/branches/2.x/src/plugin/index-metadata/src/java/org/apache/nutch/indexer/metadata/MetadataIndexer.java
nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java
nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java
nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java
nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java
nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java
nutch/branches/2.x/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
nutch/branches/2.x/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/TestMetaTagsParser.java
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
nutch/branches/2.x/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
nutch/branches/2.x/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java
nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestAdaptiveFetchSchedule.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java
nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java
nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java
nutch/branches/2.x/src/test/org/apache/nutch/util/CrawlTestUtil.java
nutch/branches/2.x/src/test/org/apache/nutch/util/TestEncodingDetector.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu May 15 08:10:07 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1714 Upgrade to Gora 0.4 (Alparslan Avcı via jnioche)
+
* NUTCH-1752 Cache robots.txt rules per protocol:host:port (snagel)
* NUTCH-1613 Timeouts in protocol-httpclient when crawling same host with >2 threads (brian44 via jnioche)
Modified: nutch/branches/2.x/ivy/ivy.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/ivy/ivy.xml (original)
+++ nutch/branches/2.x/ivy/ivy.xml Thu May 15 08:10:07 2014
@@ -102,8 +102,7 @@
<!-- N.B. To use Gora SNAPSHOTs, merely replace the 'rev' value with the SNAPSHOT version
and add changing="true" alongside the dependency declaration. An example has been
provided for the gora-core dependency as below -->
- <dependency org="org.apache.gora" name="gora-core" rev="0.3" conf="*->default"/>
- <!--dependency org="org.apache.gora" name="gora-core" rev="0.4-SNAPSHOT" conf="*->default" changing="true"/-->
+ <dependency org="org.apache.gora" name="gora-core" rev="0.4" conf="*->default"/>
<!-- Uncomment this to use SQL as Gora backend. It should be noted that the
gora-sql 0.1.1-incubating artifact is NOT compatible with gora-core 0.3. Users should
@@ -117,15 +116,16 @@
-->
<!-- Uncomment this to use HBase as Gora backend. -->
<!--
- <dependency org="org.apache.gora" name="gora-hbase" rev="0.3" conf="*->default" />
+ <dependency org="org.apache.gora" name="gora-hbase" rev="0.4" conf="*->default" />
-->
<!-- Uncomment this to use Accumulo as Gora backend. -->
<!--
- <dependency org="org.apache.gora" name="gora-accumulo" rev="0.3" conf="*->default" />
+ <dependency org="org.apache.gora" name="gora-accumulo" rev="0.4" conf="*->default" />
-->
<!-- Uncomment this to use Cassandra as Gora backend. -->
-
- <dependency org="org.apache.gora" name="gora-cassandra" rev="0.3" conf="*->default" />
+ <!--
+ <dependency org="org.apache.gora" name="gora-cassandra" rev="0.4" conf="*->default" />
+ -->
<!--global exclusion -->
Modified: nutch/branches/2.x/src/gora/host.avsc
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/gora/host.avsc?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/gora/host.avsc (original)
+++ nutch/branches/2.x/src/gora/host.avsc Thu May 15 08:10:07 2014
@@ -2,8 +2,8 @@
"type": "record",
"namespace": "org.apache.nutch.storage",
"fields": [
- {"name": "metadata", "type": {"type": "map", "values": "bytes"}},
- {"name": "outlinks", "type": {"type": "map", "values": "string"}},
- {"name": "inlinks", "type": {"type": "map", "values": "string"}}
+ {"name": "metadata", "type": {"type": "map", "values": "bytes"}, "default":{}},
+ {"name": "outlinks", "type": {"type": "map", "values": "string"}, "default":{}},
+ {"name": "inlinks", "type": {"type": "map", "values": "string"}, "default":{}}
]
}
Modified: nutch/branches/2.x/src/gora/webpage.avsc
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/gora/webpage.avsc?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/gora/webpage.avsc (original)
+++ nutch/branches/2.x/src/gora/webpage.avsc Thu May 15 08:10:07 2014
@@ -2,47 +2,47 @@
"type": "record",
"namespace": "org.apache.nutch.storage",
"fields": [
- {"name": "baseUrl", "type": "string"},
- {"name": "status", "type": "int"},
- {"name": "fetchTime", "type": "long"},
- {"name": "prevFetchTime", "type": "long"},
- {"name": "fetchInterval", "type": "int"},
- {"name": "retriesSinceFetch", "type": "int"},
- {"name": "modifiedTime", "type": "long"},
- {"name": "prevModifiedTime", "type": "long"},
- {"name": "protocolStatus", "type": {
+ {"name": "baseUrl", "type": ["null","string"], "default":null},
+ {"name": "status", "type": "int", "default":0},
+ {"name": "fetchTime", "type": "long", "default":0},
+ {"name": "prevFetchTime", "type": "long", "default":0},
+ {"name": "fetchInterval", "type": "int", "default":0},
+ {"name": "retriesSinceFetch", "type": "int", "default":0},
+ {"name": "modifiedTime", "type": "long", "default":0},
+ {"name": "prevModifiedTime", "type": "long", "default":0},
+ {"name": "protocolStatus", "type": ["null", {
"name": "ProtocolStatus",
"type": "record",
"namespace": "org.apache.nutch.storage",
"fields": [
- {"name": "code", "type": "int"},
- {"name": "args", "type": {"type": "array", "items": "string"}},
- {"name": "lastModified", "type": "long"}
+ {"name": "code", "type": "int", "default":0},
+ {"name": "args", "type": {"type": "array", "items": "string"}, "default":[]},
+ {"name": "lastModified", "type": "long", "default":0}
]
- }},
- {"name": "content", "type": "bytes"},
- {"name": "contentType", "type": "string"},
- {"name": "prevSignature", "type": "bytes"},
- {"name": "signature", "type": "bytes"},
- {"name": "title", "type": "string"},
- {"name": "text", "type": "string"},
- {"name": "parseStatus", "type": {
+ }], "default":null},
+ {"name": "content", "type": ["null","bytes"], "default":null},
+ {"name": "contentType", "type": ["null","string"], "default":null},
+ {"name": "prevSignature", "type": ["null","bytes"], "default":null},
+ {"name": "signature", "type": ["null","bytes"], "default":null},
+ {"name": "title", "type": ["null","string"], "default":null},
+ {"name": "text", "type": ["null","string"], "default":null},
+ {"name": "parseStatus", "type": ["null", {
"name": "ParseStatus",
"type": "record",
"namespace": "org.apache.nutch.storage",
"fields": [
- {"name": "majorCode", "type": "int"},
- {"name": "minorCode", "type": "int"},
- {"name": "args", "type": {"type": "array", "items": "string"}}
+ {"name": "majorCode", "type": "int", "default":0},
+ {"name": "minorCode", "type": "int", "default":0},
+ {"name": "args", "type": {"type": "array", "items": "string"}, "default":[]}
]
- }},
- {"name": "score", "type": "float"},
- {"name": "reprUrl", "type": "string"},
- {"name": "headers", "type": {"type": "map", "values": "string"}},
- {"name": "outlinks", "type": {"type": "map", "values": "string"}},
- {"name": "inlinks", "type": {"type": "map", "values": "string"}},
- {"name": "markers", "type": {"type": "map", "values": "string"}},
- {"name": "metadata", "type": {"type": "map", "values": "bytes"}},
- {"name": "batchId", "type": "string"}
+ }], "default":null},
+ {"name": "score", "type": "float", "default":0},
+ {"name": "reprUrl", "type": ["null","string"], "default":null},
+ {"name": "headers", "type": {"type":"map", "values": ["null","string"]}, "default":{}},
+ {"name": "outlinks", "type": {"type": "map", "values": ["null","string"]}, "default":{}},
+ {"name": "inlinks", "type": {"type": "map", "values": ["null","string"]}, "default":{}},
+ {"name": "markers", "type": {"type": "map", "values": ["null","string"]}, "default":{}},
+ {"name": "metadata", "type": {"type": "map", "values": ["null","bytes"]}, "default":{}},
+ {"name": "batchId", "type": ["null","string"], "default":null}
]
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java Thu May 15 08:10:07 2014
@@ -16,16 +16,7 @@
******************************************************************************/
package org.apache.nutch.api;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.TreeSet;
-
+import org.apache.avro.Schema;
import org.apache.avro.util.Utf8;
import org.apache.gora.query.Query;
import org.apache.gora.query.Result;
@@ -34,11 +25,7 @@ import org.apache.hadoop.conf.Configurat
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.ParseStatusUtils;
import org.apache.nutch.protocol.ProtocolStatusUtils;
-import org.apache.nutch.storage.Mark;
-import org.apache.nutch.storage.ParseStatus;
-import org.apache.nutch.storage.ProtocolStatus;
-import org.apache.nutch.storage.StorageUtils;
-import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.storage.*;
import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;
@@ -46,6 +33,11 @@ import org.apache.nutch.util.TableUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.Map.Entry;
+
public class DbReader {
private static final Logger LOG = LoggerFactory.getLogger(DbReader.class);
@@ -145,7 +137,7 @@ public class DbReader {
public Map<String,Object> next() {
url = res.getKey();
- page = (WebPage)res.get().clone();
+ page = WebPage.newBuilder(res.get()).build();
try {
advance();
if (!hasNext) {
@@ -169,16 +161,16 @@ public class DbReader {
if (fields == null || fields.contains("url")) {
res.put("url", TableUtil.unreverseUrl(url));
}
- String[] pfields = page.getFields();
- TreeSet<String> flds = null;
+ List<Schema.Field> pfields = page.getSchema().getFields();
+ TreeSet<Schema.Field> flds = null;
if (fields != null) {
- flds = (TreeSet<String>)fields.clone();
+ flds = (TreeSet<Schema.Field>) fields.clone();
} else {
- flds = new TreeSet<String>(Arrays.asList(pfields));
+ flds = new TreeSet<Schema.Field>(pfields);
}
flds.retainAll(Arrays.asList(pfields));
- for (String f : flds) {
- int idx = page.getFieldIndex(f);
+ for (Schema.Field f : flds) {
+ int idx = f.pos();
if (idx < 0) {
continue;
}
@@ -187,43 +179,43 @@ public class DbReader {
continue;
}
if ("metadata".equals(f)) {
- Map<Utf8, ByteBuffer> metadata = page.getMetadata();
+ Map<CharSequence, ByteBuffer> metadata = page.getMetadata();
Map<String,String> simpleMeta = new HashMap<String,String>();
if (metadata != null) {
- Iterator<Entry<Utf8, ByteBuffer>> iterator = metadata.entrySet()
+ Iterator<Entry<CharSequence, ByteBuffer>> iterator = metadata.entrySet()
.iterator();
while (iterator.hasNext()) {
- Entry<Utf8, ByteBuffer> entry = iterator.next();
+ Entry<CharSequence, ByteBuffer> entry = iterator.next();
simpleMeta.put(entry.getKey().toString(),
Bytes.toStringBinary(entry.getValue()));
}
}
- res.put(f, simpleMeta);
+ res.put(f.name(), simpleMeta);
} else if ("protocolStatus".equals(f)) {
ProtocolStatus ps = page.getProtocolStatus();
- res.put(f, ProtocolStatusUtils.toString(ps));
+ res.put(f.name(), ProtocolStatusUtils.toString(ps));
} else if ("parseStatus".equals(f)) {
ParseStatus ps = page.getParseStatus();
- res.put(f, ParseStatusUtils.toString(ps));
+ res.put(f.name(), ParseStatusUtils.toString(ps));
} else if ("signature".equals(f)) {
ByteBuffer bb = page.getSignature();
- res.put(f, StringUtil.toHexString(bb));
+ res.put(f.name(), StringUtil.toHexString(bb));
} else if ("content".equals(f)) {
ByteBuffer bb = page.getContent();
- res.put(f, Bytes.toStringBinary(bb));
+ res.put(f.name(), Bytes.toStringBinary(bb));
} else if ("markers".equals(f)) {
- res.put(f, convertMap(page.getMarkers()));
+ res.put(f.name(), convertMap(page.getMarkers()));
} else if ("inlinks".equals(f)) {
- res.put(f, convertMap(page.getInlinks()));
+ res.put(f.name(), convertMap(page.getInlinks()));
} else if ("outlinks".equals(f)) {
- res.put(f, convertMap(page.getOutlinks()));
+ res.put(f.name(), convertMap(page.getOutlinks()));
} else {
if (val instanceof Utf8) {
val = val.toString();
} else if (val instanceof ByteBuffer) {
val = Bytes.toStringBinary((ByteBuffer)val);
}
- res.put(f, val);
+ res.put(f.name(), val);
}
}
return res;
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Thu May 15 08:10:07 2014
@@ -17,14 +17,14 @@
package org.apache.nutch.crawl;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.nutch.storage.WebPage;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.HashSet;
+import java.util.Set;
/**
* This class provides common methods for implementations of
@@ -196,7 +196,7 @@ implements FetchSchedule {
// reduce fetchInterval so that it fits within the max value
if (page.getFetchInterval() > maxInterval)
page.setFetchInterval(Math.round(maxInterval * 0.9f));
- page.setStatus(CrawlStatus.STATUS_UNFETCHED);
+ page.setStatus((int) CrawlStatus.STATUS_UNFETCHED);
page.setRetriesSinceFetch(0);
// TODO: row.setSignature(null) ??
page.setModifiedTime(0L);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateMapper.java Thu May 15 08:10:07 2014
@@ -67,11 +67,11 @@ extends GoraMapper<String, WebPage, UrlW
String url = TableUtil.unreverseUrl(key);
scoreData.clear();
- Map<Utf8, Utf8> outlinks = page.getOutlinks();
+ Map<CharSequence, CharSequence> outlinks = page.getOutlinks();
if (outlinks != null) {
- for (Entry<Utf8, Utf8> e : outlinks.entrySet()) {
+ for (Entry<CharSequence, CharSequence> e : outlinks.entrySet()) {
int depth=Integer.MAX_VALUE;
- Utf8 depthUtf8=page.getFromMarkers(DbUpdaterJob.DISTANCE);
+ CharSequence depthUtf8 = page.getMarkers().get(DbUpdaterJob.DISTANCE);
if (depthUtf8 != null) depth=Integer.parseInt(depthUtf8.toString());
scoreData.add(new ScoreDatum(0.0f, e.getKey().toString(),
e.getValue().toString(), depth));
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java Thu May 15 08:10:07 2014
@@ -22,7 +22,8 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.avro.util.Utf8;
-import org.slf4j.Logger;
+import org.apache.gora.mapreduce.GoraReducer;
+import org.apache.gora.store.DataStore;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.StringUtils;
@@ -35,7 +36,7 @@ import org.apache.nutch.storage.Mark;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.WebPageWritable;
-import org.apache.gora.mapreduce.GoraReducer;
+import org.slf4j.Logger;
public class DbUpdateReducer
extends GoraReducer<UrlWithScore, NutchWritable, String, WebPage> {
@@ -97,16 +98,16 @@ extends GoraReducer<UrlWithScore, NutchW
if (!additionsAllowed) {
return;
}
- page = new WebPage();
+ page = WebPage.newBuilder().build();
schedule.initializeSchedule(url, page);
- page.setStatus(CrawlStatus.STATUS_UNFETCHED);
+ page.setStatus((int) CrawlStatus.STATUS_UNFETCHED);
try {
scoringFilters.initialScore(url, page);
} catch (ScoringFilterException e) {
page.setScore(0.0f);
}
} else {
- byte status = (byte)page.getStatus();
+ byte status = page.getStatus().byteValue();
switch (status) {
case CrawlStatus.STATUS_FETCHED: // succesful fetch
case CrawlStatus.STATUS_REDIR_TEMP: // successful fetch, redirected
@@ -129,7 +130,7 @@ extends GoraReducer<UrlWithScore, NutchW
long prevFetchTime = page.getPrevFetchTime();
long modifiedTime = page.getModifiedTime();
long prevModifiedTime = page.getPrevModifiedTime();
- Utf8 lastModified = page.getFromHeaders(new Utf8("Last-Modified"));
+ CharSequence lastModified = page.getHeaders().get(new Utf8("Last-Modified"));
if ( lastModified != null ){
try {
modifiedTime = HttpDateFormat.toLong(lastModified.toString());
@@ -145,9 +146,9 @@ extends GoraReducer<UrlWithScore, NutchW
case CrawlStatus.STATUS_RETRY:
schedule.setPageRetrySchedule(url, page, 0L, page.getPrevModifiedTime(), page.getFetchTime());
if (page.getRetriesSinceFetch() < retryMax) {
- page.setStatus(CrawlStatus.STATUS_UNFETCHED);
+ page.setStatus((int)CrawlStatus.STATUS_UNFETCHED);
} else {
- page.setStatus(CrawlStatus.STATUS_GONE);
+ page.setStatus((int)CrawlStatus.STATUS_GONE);
}
break;
case CrawlStatus.STATUS_GONE:
@@ -171,15 +172,15 @@ extends GoraReducer<UrlWithScore, NutchW
if (inlinkDist < smallestDist) {
smallestDist=inlinkDist;
}
- page.putToInlinks(new Utf8(inlink.getUrl()), new Utf8(inlink.getAnchor()));
+ page.getInlinks().put(new Utf8(inlink.getUrl()), new Utf8(inlink.getAnchor()));
}
if (smallestDist != Integer.MAX_VALUE) {
int oldDistance=Integer.MAX_VALUE;
- Utf8 oldDistUtf8 = page.getFromMarkers(DbUpdaterJob.DISTANCE);
+ CharSequence oldDistUtf8 = page.getMarkers().get(DbUpdaterJob.DISTANCE);
if (oldDistUtf8 != null)oldDistance=Integer.parseInt(oldDistUtf8.toString());
int newDistance = smallestDist+1;
if (newDistance < oldDistance) {
- page.putToMarkers(DbUpdaterJob.DISTANCE, new Utf8(Integer.toString(newDistance)));
+ page.getMarkers().put(DbUpdaterJob.DISTANCE, new Utf8(Integer.toString(newDistance)));
}
}
@@ -193,8 +194,8 @@ extends GoraReducer<UrlWithScore, NutchW
// clear markers
// But only delete when they exist. This is much faster for the underlying
// store. The markers are on the input anyway.
- if (page.getFromMetadata(FetcherJob.REDIRECT_DISCOVERED) != null) {
- page.removeFromMetadata(FetcherJob.REDIRECT_DISCOVERED);
+ if (page.getMetadata().get(FetcherJob.REDIRECT_DISCOVERED) != null) {
+ page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED, null);
}
Mark.GENERATE_MARK.removeMarkIfExist(page);
Mark.FETCH_MARK.removeMarkIfExist(page);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorMapper.java Thu May 15 08:10:07 2014
@@ -16,9 +16,8 @@
******************************************************************************/
package org.apache.nutch.crawl;
-import java.io.IOException;
-import java.net.MalformedURLException;
-
+import org.apache.avro.util.Utf8;
+import org.apache.gora.mapreduce.GoraMapper;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;
import org.apache.nutch.net.URLFilterException;
@@ -29,8 +28,11 @@ import org.apache.nutch.scoring.ScoringF
import org.apache.nutch.storage.Mark;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.TableUtil;
-import org.apache.avro.util.Utf8;
-import org.apache.gora.mapreduce.GoraMapper;
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.nio.ByteBuffer;
+import java.util.HashMap;
public class GeneratorMapper
extends GoraMapper<String, WebPage, SelectorEntry, WebPage> {
@@ -59,7 +61,7 @@ extends GoraMapper<String, WebPage, Sele
//filter on distance
if (maxDistance > -1) {
- Utf8 distanceUtf8 = page.getFromMarkers(DbUpdaterJob.DISTANCE);
+ CharSequence distanceUtf8 = page.getMarkers().get(DbUpdaterJob.DISTANCE);
if (distanceUtf8 != null) {
int distance=Integer.parseInt(distanceUtf8.toString());
if (distance > maxDistance) {
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorReducer.java Thu May 15 08:10:07 2014
@@ -22,6 +22,10 @@ import java.util.HashMap;
import java.util.Map;
import org.apache.avro.util.Utf8;
+import org.apache.gora.mapreduce.GoraReducer;
+import org.apache.gora.query.Query;
+import org.apache.gora.query.Result;
+import org.apache.gora.store.DataStore;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.GeneratorJob.SelectorEntry;
import org.apache.nutch.fetcher.FetcherJob.FetcherMapper;
@@ -29,7 +33,6 @@ import org.apache.nutch.storage.Mark;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.URLUtil;
-import org.apache.gora.mapreduce.GoraReducer;
/** Reduce class for generate
*
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/InjectorJob.java Thu May 15 08:10:07 2014
@@ -16,15 +16,6 @@
******************************************************************************/
package org.apache.nutch.crawl;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.text.SimpleDateFormat;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeMap;
-
import org.apache.avro.util.Utf8;
import org.apache.gora.mapreduce.GoraOutputFormat;
import org.apache.gora.persistency.Persistent;
@@ -47,15 +38,15 @@ import org.apache.nutch.scoring.ScoringF
import org.apache.nutch.storage.Mark;
import org.apache.nutch.storage.StorageUtils;
import org.apache.nutch.storage.WebPage;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.NutchTool;
-import org.apache.nutch.util.TableUtil;
-import org.apache.nutch.util.TimingUtil;
-import org.apache.nutch.util.ToolUtil;
+import org.apache.nutch.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.text.SimpleDateFormat;
+import java.util.*;
+
/** This class takes a flat file of URLs and adds them to the set of pages to be
* crawled. Useful for bootstrapping the system.
* The URL files contain one URL per line, optionally followed by custom metadata
@@ -160,7 +151,7 @@ public class InjectorJob extends NutchTo
return;
} else { // if it passes
String reversedUrl = TableUtil.reverseUrl(url); // collect it
- WebPage row = new WebPage();
+ WebPage row = WebPage.newBuilder().build();
row.setFetchTime(curTime);
row.setFetchInterval(customInterval);
@@ -169,7 +160,7 @@ public class InjectorJob extends NutchTo
while (keysIter.hasNext()) {
String keymd = keysIter.next();
String valuemd = metadata.get(keymd);
- row.putToMetadata(new Utf8(keymd), ByteBuffer.wrap(valuemd.getBytes()));
+ row.getMetadata().put(new Utf8(keymd), ByteBuffer.wrap(valuemd.getBytes()));
}
if (customScore != -1)
@@ -186,7 +177,7 @@ public class InjectorJob extends NutchTo
}
}
context.getCounter("injector", "urls_injected").increment(1);
- row.putToMarkers(DbUpdaterJob.DISTANCE, new Utf8(String.valueOf(0)));
+ row.getMarkers().put(DbUpdaterJob.DISTANCE, new Utf8(String.valueOf(0)));
Mark.INJECT_MARK.putMark(row, YES_STRING);
context.write(reversedUrl, row);
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java Thu May 15 08:10:07 2014
@@ -17,14 +17,14 @@
package org.apache.nutch.crawl;
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.HashSet;
-
import org.apache.avro.util.Utf8;
import org.apache.hadoop.io.MD5Hash;
import org.apache.nutch.storage.WebPage;
+import java.nio.ByteBuffer;
+import java.util.Collection;
+import java.util.HashSet;
+
/**
* Default implementation of a page signature. It calculates an MD5 hash
* of the raw binary content of a page. In case there is no content, it
@@ -47,7 +47,7 @@ public class MD5Signature extends Signat
int of;
int cb;
if (buf == null) {
- Utf8 baseUrl = page.getBaseUrl();
+ Utf8 baseUrl = (Utf8) page.getBaseUrl();
if (baseUrl == null) {
data = null;
of = 0;
@@ -56,7 +56,7 @@ public class MD5Signature extends Signat
else {
data = baseUrl.getBytes();
of = 0;
- cb = baseUrl.getLength();
+ cb = baseUrl.getByteLength();
}
} else {
data = buf.array();
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java Thu May 15 08:10:07 2014
@@ -16,18 +16,11 @@
******************************************************************************/
package org.apache.nutch.crawl;
-import java.io.IOException;
-import java.net.URL;
-import java.nio.ByteBuffer;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.TreeMap;
-import java.util.regex.Pattern;
-
import org.apache.avro.util.Utf8;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
+import org.apache.gora.mapreduce.GoraMapper;
+import org.apache.gora.query.Query;
+import org.apache.gora.query.Result;
+import org.apache.gora.store.DataStore;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -47,17 +40,19 @@ import org.apache.nutch.parse.ParseStatu
import org.apache.nutch.protocol.ProtocolStatusUtils;
import org.apache.nutch.storage.StorageUtils;
import org.apache.nutch.storage.WebPage;
-import org.apache.nutch.util.Bytes;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.nutch.util.NutchJob;
-import org.apache.nutch.util.NutchTool;
-import org.apache.nutch.util.StringUtil;
-import org.apache.nutch.util.TableUtil;
-import org.apache.nutch.util.ToolUtil;
-import org.apache.gora.mapreduce.GoraMapper;
-import org.apache.gora.query.Query;
-import org.apache.gora.query.Result;
-import org.apache.gora.store.DataStore;
+import org.apache.nutch.util.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.TreeMap;
+import java.util.regex.Pattern;
/**
* Displays information about the entries of the webtable
@@ -320,7 +315,10 @@ public class WebTableReader extends Nutc
DataStore<String, WebPage> store = StorageUtils.createWebStore(job
.getConfiguration(), String.class, WebPage.class);
Query<String, WebPage> query = store.newQuery();
- query.setFields(WebPage._ALL_FIELDS);
+ //remove the __g__dirty field since it is not stored
+ String[] fields = Arrays.copyOfRange(WebPage._ALL_FIELDS, 1,
+ WebPage._ALL_FIELDS.length);
+ query.setFields(fields);
GoraMapper.initMapperJob(job, query, store, Text.class, Text.class,
WebTableRegexMapper.class, null, true);
@@ -344,7 +342,7 @@ public class WebTableReader extends Nutc
sb.append("key:\t" + key).append("\n");
sb.append("baseUrl:\t" + page.getBaseUrl()).append("\n");
sb.append("status:\t").append(page.getStatus()).append(" (").append(
- CrawlStatus.getName((byte) page.getStatus())).append(")\n");
+ CrawlStatus.getName(page.getStatus().byteValue())).append(")\n");
sb.append("fetchTime:\t" + page.getFetchTime()).append("\n");
sb.append("prevFetchTime:\t" + page.getPrevFetchTime()).append("\n");
sb.append("fetchInterval:\t" + page.getFetchInterval()).append("\n");
@@ -366,41 +364,41 @@ public class WebTableReader extends Nutc
sb.append("title:\t" + page.getTitle()).append("\n");
sb.append("score:\t" + page.getScore()).append("\n");
- Map<Utf8, Utf8> markers = page.getMarkers();
+ Map<CharSequence, CharSequence> markers = page.getMarkers();
sb.append("markers:\t" + markers).append("\n");
sb.append("reprUrl:\t" + page.getReprUrl()).append("\n");
- Utf8 batchId = page.getBatchId();
+ CharSequence batchId = page.getBatchId();
if (batchId != null) {
sb.append("batchId:\t" + batchId.toString()).append("\n");
}
- Map<Utf8, ByteBuffer> metadata = page.getMetadata();
+ Map<CharSequence, ByteBuffer> metadata = page.getMetadata();
if (metadata != null) {
- Iterator<Entry<Utf8, ByteBuffer>> iterator = metadata.entrySet()
+ Iterator<Entry<CharSequence, ByteBuffer>> iterator = metadata.entrySet()
.iterator();
while (iterator.hasNext()) {
- Entry<Utf8, ByteBuffer> entry = iterator.next();
+ Entry<CharSequence, ByteBuffer> entry = iterator.next();
sb.append("metadata " + entry.getKey().toString()).append(" : \t")
.append(Bytes.toString(entry.getValue())).append("\n");
}
}
if (dumpLinks) {
- Map<Utf8,Utf8> inlinks = page.getInlinks();
- Map<Utf8,Utf8> outlinks = page.getOutlinks();
+ Map<CharSequence, CharSequence> inlinks = page.getInlinks();
+ Map<CharSequence, CharSequence> outlinks = page.getOutlinks();
if (outlinks != null) {
- for (Entry<Utf8,Utf8> e : outlinks.entrySet()) {
+ for (Entry<CharSequence, CharSequence> e : outlinks.entrySet()) {
sb.append("outlink:\t" + e.getKey() + "\t" + e.getValue() + "\n");
}
}
if (inlinks != null) {
- for (Entry<Utf8,Utf8> e : inlinks.entrySet()) {
+ for (Entry<CharSequence, CharSequence> e : inlinks.entrySet()) {
sb.append("inlink:\t" + e.getKey() + "\t" + e.getValue() + "\n");
}
}
}
if (dumpHeaders) {
- Map<Utf8,Utf8> headers = page.getHeaders();
+ Map<CharSequence, CharSequence> headers = page.getHeaders();
if (headers != null) {
- for (Entry<Utf8,Utf8> e : headers.entrySet()) {
+ for (Entry<CharSequence, CharSequence> e : headers.entrySet()) {
sb.append("header:\t" + e.getKey() + "\t" + e.getValue() + "\n");
}
}
@@ -412,7 +410,7 @@ public class WebTableReader extends Nutc
sb.append(Bytes.toString(content));
sb.append("\ncontent:end:\n");
}
- Utf8 text = page.getText();
+ CharSequence text = page.getText();
if (text != null && dumpText) {
sb.append("text:start:\n");
sb.append(text.toString());
@@ -521,7 +519,11 @@ public class WebTableReader extends Nutc
DataStore<String, WebPage> store = StorageUtils.createWebStore(currentJob
.getConfiguration(), String.class, WebPage.class);
Query<String, WebPage> query = store.newQuery();
- query.setFields(WebPage._ALL_FIELDS);
+
+ //remove the __g__dirty field since it is not stored
+ String[] fields = Arrays.copyOfRange(WebPage._ALL_FIELDS, 1,
+ WebPage._ALL_FIELDS.length);
+ query.setFields(fields);
GoraMapper.initMapperJob(currentJob, query, store, Text.class, LongWritable.class,
WebTableStatMapper.class, null, true);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java Thu May 15 08:10:07 2014
@@ -16,43 +16,19 @@
******************************************************************************/
package org.apache.nutch.fetcher;
-import java.io.IOException;
-import java.net.InetAddress;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.net.UnknownHostException;
-import java.nio.ByteBuffer;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.concurrent.atomic.AtomicInteger;
-import java.util.concurrent.atomic.AtomicLong;
-
+import crawlercommons.robots.BaseRobotRules;
import org.apache.avro.util.Utf8;
import org.apache.gora.mapreduce.GoraReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.nutch.crawl.CrawlStatus;
-import org.apache.nutch.crawl.URLWebPage;
import org.apache.nutch.host.HostDb;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParserJob;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.ProtocolStatusCodes;
-import org.apache.nutch.protocol.ProtocolStatusUtils;
-import org.apache.nutch.protocol.RobotRules;
+import org.apache.nutch.protocol.*;
import org.apache.nutch.storage.Host;
import org.apache.nutch.storage.Mark;
import org.apache.nutch.storage.ProtocolStatus;
@@ -61,7 +37,14 @@ import org.apache.nutch.util.TableUtil;
import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
-import crawlercommons.robots.BaseRobotRules;
+import java.io.IOException;
+import java.net.InetAddress;
+import java.net.URL;
+import java.net.UnknownHostException;
+import java.nio.ByteBuffer;
+import java.util.*;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicLong;
public class FetcherReducer
extends GoraReducer<IntWritable, FetchEntry, String, WebPage> {
@@ -478,7 +461,7 @@ extends GoraReducer<IntWritable, FetchEn
}
}
lastRequestStart.set(System.currentTimeMillis());
- if (!fit.page.isReadable(WebPage.Field.REPR_URL.getIndex())) {
+ if (fit.page.getReprUrl() == null) {
reprUrl = fit.url;
} else {
reprUrl = TableUtil.toString(fit.page.getReprUrl());
@@ -615,8 +598,8 @@ extends GoraReducer<IntWritable, FetchEn
}
}
- page.putToOutlinks(new Utf8(newUrl), new Utf8());
- page.putToMetadata(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
+ page.getOutlinks().put(new Utf8(newUrl), new Utf8());
+ page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
if (reprUrl == null) {
LOG.warn("reprUrl==null");
@@ -638,7 +621,7 @@ extends GoraReducer<IntWritable, FetchEn
private void output(FetchItem fit, Content content,
ProtocolStatus pstatus, byte status)
throws IOException, InterruptedException {
- fit.page.setStatus(status);
+ fit.page.setStatus((int)status);
final long prevFetchTime = fit.page.getFetchTime();
fit.page.setPrevFetchTime(prevFetchTime);
fit.page.setFetchTime(System.currentTimeMillis());
Modified: nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateReducer.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateReducer.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateReducer.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/host/HostDbUpdateReducer.java Thu May 15 08:10:07 2014
@@ -16,10 +16,6 @@
******************************************************************************/
package org.apache.nutch.host;
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.Set;
-
import org.apache.avro.util.Utf8;
import org.apache.gora.mapreduce.GoraReducer;
import org.apache.hadoop.io.Text;
@@ -29,6 +25,10 @@ import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.Histogram;
import org.apache.nutch.util.URLUtil;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.Set;
+
/**
* Combines all WebPages with the same host key to create a Host object,
* with some statistics.
@@ -58,15 +58,15 @@ public class HostDbUpdateReducer extends
// TODO: limit number of links
if (buildLinkDb) {
if (page.getInlinks() != null) {
- Set<Utf8> inlinks = page.getInlinks().keySet();
- for (Utf8 inlink: inlinks) {
+ Set<CharSequence> inlinks = page.getInlinks().keySet();
+ for (CharSequence inlink: inlinks) {
String host = URLUtil.getHost(inlink.toString());
inlinkCount.add(host);
}
}
if (page.getOutlinks() != null) {
- Set<Utf8> outlinks = page.getOutlinks().keySet();
- for (Utf8 outlink: outlinks) {
+ Set<CharSequence> outlinks = page.getOutlinks().keySet();
+ for (CharSequence outlink: outlinks) {
String host = URLUtil.getHost(outlink.toString());
outlinkCount.add(host);
}
@@ -76,15 +76,15 @@ public class HostDbUpdateReducer extends
// output host data
Host host = new Host();
- host.putToMetadata(new Utf8("p"),ByteBuffer.wrap(Integer.toString(numPages).getBytes()));
+ host.getMetadata().put(new Utf8("p"),ByteBuffer.wrap(Integer.toString(numPages).getBytes()));
if (numFetched > 0) {
- host.putToMetadata(new Utf8("f"),ByteBuffer.wrap(Integer.toString(numFetched).getBytes()));
+ host.getMetadata().put(new Utf8("f"),ByteBuffer.wrap(Integer.toString(numFetched).getBytes()));
}
for (String inlink: inlinkCount.getKeys()) {
- host.putToInlinks(new Utf8(inlink), new Utf8(Integer.toString(inlinkCount.getCount(inlink))));
+ host.getInlinks().put(new Utf8(inlink), new Utf8(Integer.toString(inlinkCount.getCount(inlink))));
}
for (String outlink: outlinkCount.getKeys()) {
- host.putToOutlinks(new Utf8(outlink), new Utf8(Integer.toString(outlinkCount.getCount(outlink))));
+ host.getOutlinks().put(new Utf8(outlink), new Utf8(Integer.toString(outlinkCount.getCount(outlink))));
}
context.write(key.toString(), host);
Modified: nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java Thu May 15 08:10:07 2014
@@ -16,15 +16,6 @@
******************************************************************************/
package org.apache.nutch.host;
-import java.io.IOException;
-import java.net.URL;
-import java.nio.ByteBuffer;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeMap;
-
import org.apache.avro.util.Utf8;
import org.apache.gora.mapreduce.GoraOutputFormat;
import org.apache.hadoop.conf.Configuration;
@@ -46,6 +37,11 @@ import org.apache.nutch.util.TableUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.io.IOException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.util.*;
+
/**
* Creates or updates an existing host table from a text file.<br>
* The files contain one host name per line, optionally followed by custom
@@ -127,7 +123,7 @@ public class HostInjectorJob implements
while (keysIter.hasNext()) {
String keymd = keysIter.next();
String valuemd = metadata.get(keymd);
- host.putToMetadata(new Utf8(keymd), ByteBuffer.wrap(valuemd.getBytes()));
+ host.getMetadata().put(new Utf8(keymd), ByteBuffer.wrap(valuemd.getBytes()));
}
String hostname;
if (url.indexOf("://")> -1) {
Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Thu May 15 08:10:07 2014
@@ -80,12 +80,12 @@ public class IndexingFiltersChecker exte
ProtocolFactory factory = new ProtocolFactory(conf);
Protocol protocol = factory.getProtocol(url);
- WebPage page = new WebPage();
+ WebPage page = WebPage.newBuilder().build();
page.setBaseUrl(new org.apache.avro.util.Utf8(url));
ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
page.setProtocolStatus(protocolOutput.getStatus());
if (protocolOutput.getStatus().getCode() == ProtocolStatusCodes.SUCCESS) {
- page.setStatus(CrawlStatus.STATUS_FETCHED);
+ page.setStatus((int)CrawlStatus.STATUS_FETCHED);
page.setFetchTime(System.currentTimeMillis());
} else {
LOG.error("Fetch failed with protocol status: "
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseStatusUtils.java Thu May 15 08:10:07 2014
@@ -16,22 +16,23 @@
******************************************************************************/
package org.apache.nutch.parse;
-import java.util.HashMap;
-import java.util.Iterator;
-
import org.apache.avro.generic.GenericArray;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.storage.ParseStatus;
import org.apache.nutch.util.TableUtil;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+
public class ParseStatusUtils {
- public static ParseStatus STATUS_SUCCESS = new ParseStatus();
+ public static ParseStatus STATUS_SUCCESS = ParseStatus.newBuilder().build();
public static final HashMap<Short,String> minorCodes = new HashMap<Short,String>();
static {
- STATUS_SUCCESS.setMajorCode(ParseStatusCodes.SUCCESS);
+ STATUS_SUCCESS.setMajorCode((int)ParseStatusCodes.SUCCESS);
minorCodes.put(ParseStatusCodes.SUCCESS_OK, "ok");
minorCodes.put(ParseStatusCodes.SUCCESS_REDIRECT, "redirect");
minorCodes.put(ParseStatusCodes.FAILED_EXCEPTION, "exception");
@@ -52,7 +53,7 @@ public class ParseStatusUtils {
* argument, or null.
*/
public static String getMessage(ParseStatus status) {
- GenericArray<Utf8> args = status.getArgs();
+ List<CharSequence> args = status.getArgs();
if (args != null && args.size() > 0) {
return TableUtil.toString(args.iterator().next());
}
@@ -60,12 +61,12 @@ public class ParseStatusUtils {
}
public static String getArg(ParseStatus status, int n) {
- GenericArray<Utf8> args = status.getArgs();
+ List<CharSequence> args = status.getArgs();
if (args == null) {
return null;
}
int i = 0;
- for (Utf8 arg : args) {
+ for (CharSequence arg : args) {
if (i == n) {
return TableUtil.toString(arg);
}
@@ -75,19 +76,19 @@ public class ParseStatusUtils {
}
public static Parse getEmptyParse(Exception e, Configuration conf) {
- ParseStatus status = new ParseStatus();
- status.setMajorCode(ParseStatusCodes.FAILED);
- status.setMinorCode(ParseStatusCodes.FAILED_EXCEPTION);
- status.addToArgs(new Utf8(e.toString()));
+ ParseStatus status = ParseStatus.newBuilder().build();
+ status.setMajorCode((int)ParseStatusCodes.FAILED);
+ status.setMinorCode((int)ParseStatusCodes.FAILED_EXCEPTION);
+ status.getArgs().add(new Utf8(e.toString()));
return new Parse("", "", new Outlink[0], status);
}
public static Parse getEmptyParse(int minorCode, String message, Configuration conf) {
- ParseStatus status = new ParseStatus();
- status.setMajorCode(ParseStatusCodes.FAILED);
+ ParseStatus status = ParseStatus.newBuilder().build();
+ status.setMajorCode((int)ParseStatusCodes.FAILED);
status.setMinorCode(minorCode);
- status.addToArgs(new Utf8(message));
+ status.getArgs().add(new Utf8(message));
return new Parse("", "", new Outlink[0], status);
}
@@ -98,13 +99,13 @@ public class ParseStatusUtils {
}
StringBuilder sb = new StringBuilder();
sb.append(ParseStatusCodes.majorCodes[status.getMajorCode()] +
- "/" + minorCodes.get((short)status.getMinorCode()));
+ "/" + minorCodes.get(status.getMinorCode().shortValue()));
sb.append(" (" + status.getMajorCode() + "/" + status.getMinorCode() + ")");
sb.append(", args=[");
- GenericArray<Utf8> args = status.getArgs();
+ List<CharSequence> args = status.getArgs();
if (args != null) {
int i = 0;
- Iterator<Utf8> it = args.iterator();
+ Iterator<CharSequence> it = args.iterator();
while (it.hasNext()) {
if (i > 0) sb.append(',');
sb.append(it.next());
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Thu May 15 08:10:07 2014
@@ -17,14 +17,8 @@
package org.apache.nutch.parse;
// Commons Logging imports
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.nio.ByteBuffer;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
@@ -43,7 +37,13 @@ import org.apache.nutch.util.URLUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import com.google.common.util.concurrent.ThreadFactoryBuilder;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
/**
* A Utility class containing methods to simply perform parsing utilities such
@@ -162,7 +162,7 @@ public class ParseUtil extends Configure
*/
public void process(String key, WebPage page) {
String url = TableUtil.unreverseUrl(key);
- byte status = (byte) page.getStatus();
+ byte status = page.getStatus().byteValue();
if (status != CrawlStatus.STATUS_FETCHED) {
if (LOG.isDebugEnabled()) {
LOG.debug("Skipping " + url + " as status is: " + CrawlStatus.getName(status));
@@ -214,8 +214,8 @@ public class ParseUtil extends Configure
LOG.warn("malformed url exception parsing redirect " + url);
return;
}
- page.putToOutlinks(new Utf8(newUrl), new Utf8());
- page.putToMetadata(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
+ page.getOutlinks().put(new Utf8(newUrl), new Utf8());
+ page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL);
if (newUrl == null || newUrl.equals(url)) {
String reprUrl = URLUtil.chooseRepr(url, newUrl,
refreshTime < FetcherJob.PERM_REFRESH_TIME);
@@ -265,7 +265,7 @@ public class ParseUtil extends Configure
continue;
}
Utf8 utf8ToUrl = new Utf8(toUrl);
- if (page.getFromOutlinks(utf8ToUrl) != null) {
+ if (page.getOutlinks().get(utf8ToUrl) != null) {
// skip duplicate outlinks
continue;
}
@@ -281,7 +281,7 @@ public class ParseUtil extends Configure
}
}
validCount++;
- page.putToOutlinks(utf8ToUrl, new Utf8(outlinks[i].getAnchor()));
+ page.getOutlinks().put(utf8ToUrl, new Utf8(outlinks[i].getAnchor()));
}
Utf8 fetchMark = Mark.FETCH_MARK.checkMark(page);
if (fetchMark != null) {
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Thu May 15 08:10:07 2014
@@ -17,28 +17,24 @@
package org.apache.nutch.parse;
-import java.nio.ByteBuffer;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Map.Entry;
-
import org.apache.avro.util.Utf8;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.SignatureFactory;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.ProtocolStatusUtils;
+import org.apache.nutch.protocol.*;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.URLUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.nio.ByteBuffer;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
/**
* Parser checker, useful for testing parser.
@@ -107,7 +103,7 @@ public class ParserChecker implements To
ProtocolFactory factory = new ProtocolFactory(conf);
Protocol protocol = factory.getProtocol(url);
- WebPage page = new WebPage();
+ WebPage page = WebPage.newBuilder().build();
ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page);
@@ -163,13 +159,13 @@ public class ParserChecker implements To
LOG.info("---------\nUrl\n---------------\n");
System.out.print(url + "\n");
LOG.info("---------\nMetadata\n---------\n");
- Map<Utf8, ByteBuffer> metadata = page.getMetadata();
+ Map<CharSequence, ByteBuffer> metadata = page.getMetadata();
StringBuffer sb = new StringBuffer();
if (metadata != null) {
- Iterator<Entry<Utf8, ByteBuffer>> iterator = metadata.entrySet()
+ Iterator<Entry<CharSequence, ByteBuffer>> iterator = metadata.entrySet()
.iterator();
while (iterator.hasNext()) {
- Entry<Utf8, ByteBuffer> entry = iterator.next();
+ Entry<CharSequence, ByteBuffer> entry = iterator.next();
sb.append(entry.getKey().toString()).append(" : \t")
.append(Bytes.toString(entry.getValue())).append("\n");
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserJob.java Thu May 15 08:10:07 2014
@@ -102,7 +102,7 @@ public class ParserJob extends NutchTool
@Override
public void map(String key, WebPage page, Context context)
throws IOException, InterruptedException {
- Utf8 mark = Mark.FETCH_MARK.checkMark(page);
+ CharSequence mark = Mark.FETCH_MARK.checkMark(page);
String unreverseKey = TableUtil.unreverseUrl(key);
if (batchId.equals(REPARSE)) {
LOG.debug("Reparsing " + unreverseKey);
@@ -161,7 +161,7 @@ public class ParserJob extends NutchTool
if (content == null) {
return false;
}
- Utf8 lengthUtf8 = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_LENGTH));
+ CharSequence lengthUtf8 = page.getHeaders().get(new Utf8(HttpHeaders.CONTENT_LENGTH));
if (lengthUtf8 == null) {
return false;
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/ProtocolStatusUtils.java Thu May 15 08:10:07 2014
@@ -16,14 +16,15 @@
******************************************************************************/
package org.apache.nutch.protocol;
-import java.net.URL;
-import java.util.Iterator;
-
import org.apache.avro.generic.GenericArray;
import org.apache.avro.util.Utf8;
import org.apache.nutch.storage.ProtocolStatus;
import org.apache.nutch.util.TableUtil;
+import java.net.URL;
+import java.util.Iterator;
+import java.util.List;
+
public class ProtocolStatusUtils implements ProtocolStatusCodes {
// Useful static instances for status codes that don't usually require any
// additional arguments.
@@ -76,15 +77,15 @@ public class ProtocolStatusUtils impleme
}
public static ProtocolStatus makeStatus(int code) {
- ProtocolStatus pstatus = new ProtocolStatus();
+ ProtocolStatus pstatus = ProtocolStatus.newBuilder().build();
pstatus.setCode(code);
- pstatus.setLastModified(0);
+ pstatus.setLastModified(0L);
return pstatus;
}
public static ProtocolStatus makeStatus(int code, String message) {
ProtocolStatus pstatus = makeStatus(code);
- pstatus.addToArgs(new Utf8(message));
+ pstatus.getArgs().add(new Utf8(message));
return pstatus;
}
@@ -93,7 +94,7 @@ public class ProtocolStatusUtils impleme
}
public static String getMessage(ProtocolStatus pstatus) {
- GenericArray<Utf8> args = pstatus.getArgs();
+ List<CharSequence> args = pstatus.getArgs();
if (args == null || args.size() == 0) {
return null;
}
@@ -107,10 +108,10 @@ public class ProtocolStatusUtils impleme
StringBuilder sb = new StringBuilder();
sb.append(getName(status.getCode()));
sb.append(", args=[");
- GenericArray<Utf8> args = status.getArgs();
+ List<CharSequence> args = status.getArgs();
if (args != null) {
int i = 0;
- Iterator<Utf8> it = args.iterator();
+ Iterator<CharSequence> it = args.iterator();
while (it.hasNext()) {
if (i > 0) sb.append(',');
sb.append(it.next());
Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java?rev=1594812&r1=1594811&r2=1594812&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java Thu May 15 08:10:07 2014
@@ -1,157 +1,444 @@
-/*******************************************************************************
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+/**
+ * Autogenerated by Avro
*
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- ******************************************************************************/
+ * DO NOT EDIT DIRECTLY
+ */
package org.apache.nutch.storage;
-import java.nio.ByteBuffer;
-import java.util.Map;
-import org.apache.avro.Schema;
-import org.apache.avro.AvroRuntimeException;
import org.apache.avro.util.Utf8;
-import org.apache.gora.persistency.StateManager;
-import org.apache.gora.persistency.impl.PersistentBase;
-import org.apache.gora.persistency.impl.StateManagerImpl;
-import org.apache.gora.persistency.StatefulHashMap;
import org.apache.nutch.util.Bytes;
@SuppressWarnings("all")
-public class Host extends PersistentBase {
- public static final org.apache.avro.Schema _SCHEMA = org.apache.avro.Schema.parse("{\"type\":\"record\",\"name\":\"Host\",\"namespace\":\"org.apache.nutch.storage\",\"fields\":[{\"name\":\"metadata\",\"type\":{\"type\":\"map\",\"values\":\"bytes\"}},{\"name\":\"outlinks\",\"type\":{\"type\":\"map\",\"values\":\"string\"}},{\"name\":\"inlinks\",\"type\":{\"type\":\"map\",\"values\":\"string\"}}]}");
- public java.util.Map<org.apache.avro.util.Utf8,java.nio.ByteBuffer> metadata;
- public java.util.Map<org.apache.avro.util.Utf8,org.apache.avro.util.Utf8> outlinks;
- public java.util.Map<org.apache.avro.util.Utf8,org.apache.avro.util.Utf8> inlinks;
-
+public class Host extends org.apache.gora.persistency.impl.PersistentBase implements org.apache.avro.specific.SpecificRecord, org.apache.gora.persistency.Persistent {
+ public static final org.apache.avro.Schema SCHEMA$ = new org.apache.avro.Schema.Parser().parse("{\"type\":\"record\",\"name\":\"Host\",\"namespace\":\"org.apache.nutch.storage\",\"fields\":[{\"name\":\"__g__dirty\",\"type\":\"bytes\",\"doc\":\"Bytes used to represent weather or not a field is dirty.\",\"default\":\"AA==\"},{\"name\":\"metadata\",\"type\":{\"type\":\"map\",\"values\":\"bytes\"},\"default\":{}},{\"name\":\"outlinks\",\"type\":{\"type\":\"map\",\"values\":\"string\"},\"default\":{}},{\"name\":\"inlinks\",\"type\":{\"type\":\"map\",\"values\":\"string\"},\"default\":{}}]}");
+
+ /** Enum containing all data bean's fields. */
public static enum Field {
- METADATA(0,"metadata"),
- OUTLINKS(1,"outlinks"),
- INLINKS(2,"inlinks"),
+ __G__DIRTY(0, "__g__dirty"),
+ METADATA(1, "metadata"),
+ OUTLINKS(2, "outlinks"),
+ INLINKS(3, "inlinks"),
;
+ /**
+ * Field's index.
+ */
private int index;
+
+ /**
+ * Field's name.
+ */
private String name;
+
+ /**
+ * Field's constructor
+ * @param index field's index.
+ * @param name field's name.
+ */
Field(int index, String name) {this.index=index;this.name=name;}
+
+ /**
+ * Gets field's index.
+ * @return int field's index.
+ */
public int getIndex() {return index;}
+
+ /**
+ * Gets field's name.
+ * @return String field's name.
+ */
public String getName() {return name;}
+
+ /**
+ * Returns the field's name as its string representation.
+ * @return String field's name.
+ */
public String toString() {return name;}
};
- public static final String[] _ALL_FIELDS = {"metadata","outlinks","inlinks"};
- static {
- PersistentBase.registerFields(Host.class, _ALL_FIELDS);
- }
-
- public Host() {
- this(new StateManagerImpl());
- }
- public Host(StateManager stateManager) {
- super(stateManager);
- metadata = new StatefulHashMap<Utf8,ByteBuffer>();
- inlinks = new StatefulHashMap<Utf8,Utf8>();
- outlinks = new StatefulHashMap<Utf8,Utf8>();
- }
- public Host newInstance(StateManager stateManager) {
- return new Host(stateManager);
- }
- public Schema getSchema() { return _SCHEMA; }
- public Object get(int _field) {
- switch (_field) {
- case 0: return metadata;
- case 1: return outlinks;
- case 2: return inlinks;
- default: throw new AvroRuntimeException("Bad index");
+
+ public static final String[] _ALL_FIELDS = {
+ "__g__dirty",
+ "metadata",
+ "outlinks",
+ "inlinks",
+ };
+
+ /** Bytes used to represent whether or not a field is dirty. */
+ private java.nio.ByteBuffer __g__dirty = java.nio.ByteBuffer.wrap(new byte[1]);
+ private java.util.Map<CharSequence,java.nio.ByteBuffer> metadata;
+ private java.util.Map<CharSequence,CharSequence> outlinks;
+ private java.util.Map<CharSequence,CharSequence> inlinks;
+ public org.apache.avro.Schema getSchema() { return SCHEMA$; }
+ // Used by DatumWriter. Applications should not call.
+ public Object get(int field$) {
+ switch (field$) {
+ case 0: return __g__dirty;
+ case 1: return metadata;
+ case 2: return outlinks;
+ case 3: return inlinks;
+ default: throw new org.apache.avro.AvroRuntimeException("Bad index");
}
}
+
+ // Used by DatumReader. Applications should not call.
@SuppressWarnings(value="unchecked")
- public void put(int _field, Object _value) {
-
- if(isFieldEqual(_field, _value)) return;
- getStateManager().setDirty(this, _field);
- switch (_field) {
- case 0: metadata = (Map<Utf8,ByteBuffer>)_value; break;
- case 1: outlinks = (Map<Utf8,Utf8>)_value; break;
- case 2: inlinks = (Map<Utf8,Utf8>)_value; break;
- default: throw new AvroRuntimeException("Bad index");
- }
- }
- @SuppressWarnings("unchecked")
- public Map<Utf8, ByteBuffer> getMetadata() {
- return (Map<Utf8, ByteBuffer>) get(0);
- }
- public ByteBuffer getFromMetadata(Utf8 key) {
- if (metadata == null) { return null; }
- return metadata.get(key);
- }
-
- public void putToMetadata(Utf8 key, ByteBuffer value) {
- getStateManager().setDirty(this, 0);
- metadata.put(key, value);
- }
- public ByteBuffer removeFromMetadata(Utf8 key) {
- if (metadata == null) { return null; }
- getStateManager().setDirty(this, 0);
- return metadata.remove(key);
- }
- @SuppressWarnings("unchecked")
- public Map<Utf8, Utf8> getOutlinks() {
- return (Map<Utf8, Utf8>) get(1);
- }
- public Utf8 getFromOutlinks(Utf8 key) {
- if (outlinks == null) { return null; }
- return outlinks.get(key);
- }
- public void putToOutlinks(Utf8 key, Utf8 value) {
- getStateManager().setDirty(this, 1);
- outlinks.put(key, value);
- }
- public Utf8 removeFromOutlinks(Utf8 key) {
- if (outlinks == null) { return null; }
- getStateManager().setDirty(this, 1);
- return outlinks.remove(key);
- }
- @SuppressWarnings("unchecked")
- public Map<Utf8, Utf8> getInlinks() {
- return (Map<Utf8, Utf8>) get(2);
- }
- public Utf8 getFromInlinks(Utf8 key) {
- if (inlinks == null) { return null; }
- return inlinks.get(key);
- }
- public void putToInlinks(Utf8 key, Utf8 value) {
- getStateManager().setDirty(this, 2);
- inlinks.put(key, value);
- }
- public Utf8 removeFromInlinks(Utf8 key) {
- if (inlinks == null) { return null; }
- getStateManager().setDirty(this, 2);
- return inlinks.remove(key);
+ public void put(int field$, Object value) {
+ switch (field$) {
+ case 0: __g__dirty = (java.nio.ByteBuffer)(value); break;
+ case 1: metadata = (java.util.Map<CharSequence,java.nio.ByteBuffer>)((value instanceof org.apache.gora.persistency.Dirtyable) ? value : new org.apache.gora.persistency.impl.DirtyMapWrapper((java.util.Map)value)); break;
+ case 2: outlinks = (java.util.Map<CharSequence,CharSequence>)((value instanceof org.apache.gora.persistency.Dirtyable) ? value : new org.apache.gora.persistency.impl.DirtyMapWrapper((java.util.Map)value)); break;
+ case 3: inlinks = (java.util.Map<CharSequence,CharSequence>)((value instanceof org.apache.gora.persistency.Dirtyable) ? value : new org.apache.gora.persistency.impl.DirtyMapWrapper((java.util.Map)value)); break;
+ default: throw new org.apache.avro.AvroRuntimeException("Bad index");
+ }
+ }
+
+ /**
+ * Gets the value of the 'metadata' field.
+ */
+ public java.util.Map<CharSequence,java.nio.ByteBuffer> getMetadata() {
+ return metadata;
+ }
+
+ /**
+ * Sets the value of the 'metadata' field.
+ * @param value the value to set.
+ */
+ public void setMetadata(java.util.Map<CharSequence,java.nio.ByteBuffer> value) {
+ this.metadata = (value instanceof org.apache.gora.persistency.Dirtyable) ? value : new org.apache.gora.persistency.impl.DirtyMapWrapper(value);
+ setDirty(1);
}
+ /**
+ * Checks the dirty status of the 'metadata' field. A field is dirty if it represents a change that has not yet been written to the database.
+ * @param value unused; only the field's dirty flag is checked.
+ */
+ public boolean isMetadataDirty(java.util.Map<CharSequence,java.nio.ByteBuffer> value) {
+ return isDirty(1);
+ }
+
+ /**
+ * Gets the value of the 'outlinks' field.
+ */
+ public java.util.Map<CharSequence,CharSequence> getOutlinks() {
+ return outlinks;
+ }
+
+ /**
+ * Sets the value of the 'outlinks' field.
+ * @param value the value to set.
+ */
+ public void setOutlinks(java.util.Map<CharSequence,CharSequence> value) {
+ this.outlinks = (value instanceof org.apache.gora.persistency.Dirtyable) ? value : new org.apache.gora.persistency.impl.DirtyMapWrapper(value);
+ setDirty(2);
+ }
+
+ /**
+ * Checks the dirty status of the 'outlinks' field. A field is dirty if it represents a change that has not yet been written to the database.
+ * @param value unused; only the field's dirty flag is checked.
+ */
+ public boolean isOutlinksDirty(java.util.Map<CharSequence,CharSequence> value) {
+ return isDirty(2);
+ }
+
+ /**
+ * Gets the value of the 'inlinks' field.
+ */
+ public java.util.Map<CharSequence,CharSequence> getInlinks() {
+ return inlinks;
+ }
+
+ /**
+ * Sets the value of the 'inlinks' field.
+ * @param value the value to set.
+ */
+ public void setInlinks(java.util.Map<CharSequence,CharSequence> value) {
+ this.inlinks = (value instanceof org.apache.gora.persistency.Dirtyable) ? value : new org.apache.gora.persistency.impl.DirtyMapWrapper(value);
+ setDirty(3);
+ }
+
+ /**
+ * Checks the dirty status of the 'inlinks' field. A field is dirty if it represents a change that has not yet been written to the database.
+ * @param value unused; only the field's dirty flag is checked.
+ */
+ public boolean isInlinksDirty(java.util.Map<CharSequence,CharSequence> value) {
+ return isDirty(3);
+ }
+
public boolean contains(String key) {
return metadata.containsKey(new Utf8(key));
}
-
+
public String getValue(String key, String defaultValue) {
- if (!contains(key)) return defaultValue;
+ if (!contains(key))
+ return defaultValue;
return Bytes.toString(metadata.get(new Utf8(key)));
}
-
+
public int getInt(String key, int defaultValue) {
- if (!contains(key)) return defaultValue;
- return Integer.parseInt(getValue(key,null));
+ if (!contains(key))
+ return defaultValue;
+ return Integer.parseInt(getValue(key, null));
}
+
public long getLong(String key, long defaultValue) {
- if (!contains(key)) return defaultValue;
- return Long.parseLong(getValue(key,null));
+ if (!contains(key))
+ return defaultValue;
+ return Long.parseLong(getValue(key, null));
+ }
+
+ /** Creates a new Host RecordBuilder */
+ public static Builder newBuilder() {
+ return new Builder();
+ }
+
+ /** Creates a new Host RecordBuilder by copying an existing Builder */
+ public static Builder newBuilder(Builder other) {
+ return new Builder(other);
+ }
+
+ /** Creates a new Host RecordBuilder by copying an existing Host instance */
+ public static Builder newBuilder(Host other) {
+ return new Builder(other);
+ }
+
+ private static java.nio.ByteBuffer deepCopyToWriteOnlyBuffer(
+ java.nio.ByteBuffer input) {
+ java.nio.ByteBuffer copy = java.nio.ByteBuffer.allocate(input.capacity());
+ int position = input.position();
+ input.reset();
+ int mark = input.position();
+ int limit = input.limit();
+ input.rewind();
+ input.limit(input.capacity());
+ copy.put(input);
+ input.rewind();
+ copy.rewind();
+ input.position(mark);
+ input.mark();
+ copy.position(mark);
+ copy.mark();
+ input.position(position);
+ copy.position(position);
+ input.limit(limit);
+ copy.limit(limit);
+ return copy.asReadOnlyBuffer();
}
-}
+
+ /**
+ * RecordBuilder for Host instances.
+ */
+ public static class Builder extends org.apache.avro.specific.SpecificRecordBuilderBase<Host>
+ implements org.apache.avro.data.RecordBuilder<Host> {
+
+ private java.nio.ByteBuffer __g__dirty;
+ private java.util.Map<CharSequence,java.nio.ByteBuffer> metadata;
+ private java.util.Map<CharSequence,CharSequence> outlinks;
+ private java.util.Map<CharSequence,CharSequence> inlinks;
+
+ /** Creates a new Builder */
+ private Builder() {
+ super(Host.SCHEMA$);
+ }
+
+ /** Creates a Builder by copying an existing Builder */
+ private Builder(Builder other) {
+ super(other);
+ }
+
+ /** Creates a Builder by copying an existing Host instance */
+ private Builder(Host other) {
+ super(Host.SCHEMA$);
+ if (isValidValue(fields()[0], other.__g__dirty)) {
+ this.__g__dirty = (java.nio.ByteBuffer) data().deepCopy(fields()[0].schema(), other.__g__dirty);
+ fieldSetFlags()[0] = true;
+ }
+ if (isValidValue(fields()[1], other.metadata)) {
+ this.metadata = (java.util.Map<CharSequence,java.nio.ByteBuffer>) data().deepCopy(fields()[1].schema(), other.metadata);
+ fieldSetFlags()[1] = true;
+ }
+ if (isValidValue(fields()[2], other.outlinks)) {
+ this.outlinks = (java.util.Map<CharSequence,CharSequence>) data().deepCopy(fields()[2].schema(), other.outlinks);
+ fieldSetFlags()[2] = true;
+ }
+ if (isValidValue(fields()[3], other.inlinks)) {
+ this.inlinks = (java.util.Map<CharSequence,CharSequence>) data().deepCopy(fields()[3].schema(), other.inlinks);
+ fieldSetFlags()[3] = true;
+ }
+ }
+
+ /** Gets the value of the 'metadata' field */
+ public java.util.Map<CharSequence,java.nio.ByteBuffer> getMetadata() {
+ return metadata;
+ }
+
+ /** Sets the value of the 'metadata' field */
+ public Builder setMetadata(java.util.Map<CharSequence,java.nio.ByteBuffer> value) {
+ validate(fields()[1], value);
+ this.metadata = value;
+ fieldSetFlags()[1] = true;
+ return this;
+ }
+
+ /** Checks whether the 'metadata' field has been set */
+ public boolean hasMetadata() {
+ return fieldSetFlags()[1];
+ }
+
+ /** Clears the value of the 'metadata' field */
+ public Builder clearMetadata() {
+ metadata = null;
+ fieldSetFlags()[1] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'outlinks' field */
+ public java.util.Map<CharSequence,CharSequence> getOutlinks() {
+ return outlinks;
+ }
+
+ /** Sets the value of the 'outlinks' field */
+ public Builder setOutlinks(java.util.Map<CharSequence,CharSequence> value) {
+ validate(fields()[2], value);
+ this.outlinks = value;
+ fieldSetFlags()[2] = true;
+ return this;
+ }
+
+ /** Checks whether the 'outlinks' field has been set */
+ public boolean hasOutlinks() {
+ return fieldSetFlags()[2];
+ }
+
+ /** Clears the value of the 'outlinks' field */
+ public Builder clearOutlinks() {
+ outlinks = null;
+ fieldSetFlags()[2] = false;
+ return this;
+ }
+
+ /** Gets the value of the 'inlinks' field */
+ public java.util.Map<CharSequence,CharSequence> getInlinks() {
+ return inlinks;
+ }
+
+ /** Sets the value of the 'inlinks' field */
+ public Builder setInlinks(java.util.Map<CharSequence,CharSequence> value) {
+ validate(fields()[3], value);
+ this.inlinks = value;
+ fieldSetFlags()[3] = true;
+ return this;
+ }
+
+ /** Checks whether the 'inlinks' field has been set */
+ public boolean hasInlinks() {
+ return fieldSetFlags()[3];
+ }
+
+ /** Clears the value of the 'inlinks' field */
+ public Builder clearInlinks() {
+ inlinks = null;
+ fieldSetFlags()[3] = false;
+ return this;
+ }
+
+ @Override
+ public Host build() {
+ try {
+ Host record = new Host();
+ record.__g__dirty = fieldSetFlags()[0] ? this.__g__dirty : (java.nio.ByteBuffer) java.nio.ByteBuffer.wrap(new byte[1]);
+ record.metadata = fieldSetFlags()[1] ? this.metadata : (java.util.Map<CharSequence,java.nio.ByteBuffer>) new org.apache.gora.persistency.impl.DirtyMapWrapper((java.util.Map)defaultValue(fields()[1]));
+ record.outlinks = fieldSetFlags()[2] ? this.outlinks : (java.util.Map<CharSequence,CharSequence>) new org.apache.gora.persistency.impl.DirtyMapWrapper((java.util.Map)defaultValue(fields()[2]));
+ record.inlinks = fieldSetFlags()[3] ? this.inlinks : (java.util.Map<CharSequence,CharSequence>) new org.apache.gora.persistency.impl.DirtyMapWrapper((java.util.Map)defaultValue(fields()[3]));
+ return record;
+ } catch (Exception e) {
+ throw new org.apache.avro.AvroRuntimeException(e);
+ }
+ }
+ }
+
+ public Tombstone getTombstone(){
+ return TOMBSTONE;
+ }
+
+ public Host newInstance(){
+ return newBuilder().build();
+ }
+
+ private static final Tombstone TOMBSTONE = new Tombstone();
+
+ public static final class Tombstone extends Host implements org.apache.gora.persistency.Tombstone {
+
+ private Tombstone() { }
+
+ /**
+ * Gets the value of the 'metadata' field.
+ */
+ public java.util.Map<CharSequence,java.nio.ByteBuffer> getMetadata() {
+ throw new UnsupportedOperationException("Get is not supported on tombstones");
+ }
+
+ /**
+ * Sets the value of the 'metadata' field.
+ * @param value the value to set.
+ */
+ public void setMetadata(java.util.Map<CharSequence,java.nio.ByteBuffer> value) {
+ throw new UnsupportedOperationException("Set is not supported on tombstones");
+ }
+
+ /**
+ * Checks the dirty status of the 'metadata' field. A field is dirty if it represents a change that has not yet been written to the database.
+ * @param value unused; this method always throws on tombstones.
+ */
+ public boolean isMetadataDirty(java.util.Map<CharSequence,java.nio.ByteBuffer> value) {
+ throw new UnsupportedOperationException("IsDirty is not supported on tombstones");
+ }
+
+ /**
+ * Gets the value of the 'outlinks' field.
+ */
+ public java.util.Map<CharSequence,CharSequence> getOutlinks() {
+ throw new UnsupportedOperationException("Get is not supported on tombstones");
+ }
+
+ /**
+ * Sets the value of the 'outlinks' field.
+ * @param value the value to set.
+ */
+ public void setOutlinks(java.util.Map<CharSequence,CharSequence> value) {
+ throw new UnsupportedOperationException("Set is not supported on tombstones");
+ }
+
+ /**
+ * Checks the dirty status of the 'outlinks' field. A field is dirty if it represents a change that has not yet been written to the database.
+ * @param value unused; this method always throws on tombstones.
+ */
+ public boolean isOutlinksDirty(java.util.Map<CharSequence,CharSequence> value) {
+ throw new UnsupportedOperationException("IsDirty is not supported on tombstones");
+ }
+
+ /**
+ * Gets the value of the 'inlinks' field.
+ */
+ public java.util.Map<CharSequence,CharSequence> getInlinks() {
+ throw new UnsupportedOperationException("Get is not supported on tombstones");
+ }
+
+ /**
+ * Sets the value of the 'inlinks' field.
+ * @param value the value to set.
+ */
+ public void setInlinks(java.util.Map<CharSequence,CharSequence> value) {
+ throw new UnsupportedOperationException("Set is not supported on tombstones");
+ }
+
+ /**
+ * Checks the dirty status of the 'inlinks' field. A field is dirty if it represents a change that has not yet been written to the database.
+ * @param value unused; this method always throws on tombstones.
+ */
+ public boolean isInlinksDirty(java.util.Map<CharSequence,CharSequence> value) {
+ throw new UnsupportedOperationException("IsDirty is not supported on tombstones");
+ }
+
+
+ }
+
+}
\ No newline at end of file