You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2017/10/19 21:28:09 UTC
[nutch] 01/03: NUTCH-2435 - New parameter "parser.store.text"
that controls whether the 'parse_text' directory is stored.
This is an automated email from the ASF dual-hosted git repository.
snagel pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/nutch.git
commit 7da8c704d1ea6dc892e32c8a9f9678affd1c7085
Author: Marcos Bori <ma...@album-online.com>
AuthorDate: Wed Sep 27 13:10:24 2017 +0200
NUTCH-2435 - New parameter "parser.store.text" that controls whether the 'parse_text' directory is stored.
---
conf/nutch-default.xml | 7 +++++++
.../org/apache/nutch/parse/ParseOutputFormat.java | 24 +++++++++++++++-------
2 files changed, 24 insertions(+), 7 deletions(-)
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 6ddf964..ed0bb98 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1379,6 +1379,13 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
</description>
</property>
+<property>
+ <name>parser.store.text</name>
+ <value>true</value>
+ <description>If true (the default), the parser stores the parse text (the parse_text directory within the segment). If false, the parse text is not written.</description>
+</property>
+
+
<!--
<property>
<name>tika.htmlmapper.classname</name>
diff --git a/src/java/org/apache/nutch/parse/ParseOutputFormat.java b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
index 6e84b12..b0778f3 100644
--- a/src/java/org/apache/nutch/parse/ParseOutputFormat.java
+++ b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
@@ -111,6 +111,9 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
"db.ignore.external.links", false);
final String ignoreExternalLinksMode = job.get(
"db.ignore.external.links.mode", "byHost");
+ //NUTCH-2435 - parameter "parser.store.text" allowing to choose whether to store 'parse_text' directory or not:
+ final boolean storeText = job.getBoolean(
+ "parser.store.text", true);
int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
final boolean isParsing = job.getBoolean("fetcher.parse", true);
@@ -128,13 +131,18 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
.split(" *, *");
// textOut Options
- Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
- org.apache.hadoop.io.SequenceFile.Writer.Option tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class);
- org.apache.hadoop.io.SequenceFile.Writer.Option tProgressOpt = SequenceFile.Writer.progressable(progress);
- org.apache.hadoop.io.SequenceFile.Writer.Option tCompOpt = SequenceFile.Writer.compression(CompressionType.RECORD);
+ final MapFile.Writer textOut;
+ if (storeText) {
+ Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class);
+ org.apache.hadoop.io.SequenceFile.Writer.Option tProgressOpt = SequenceFile.Writer.progressable(progress);
+ org.apache.hadoop.io.SequenceFile.Writer.Option tCompOpt = SequenceFile.Writer.compression(CompressionType.RECORD);
- final MapFile.Writer textOut = new MapFile.Writer(job, text,
+ textOut = new MapFile.Writer(job, text,
tKeyClassOpt, tValClassOpt, tCompOpt, tProgressOpt);
+ } else {
+ textOut=null;
+ }
// dataOut Options
Option dKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
@@ -162,7 +170,9 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
String fromUrl = key.toString();
// host or domain name of the source URL
String origin = null;
- textOut.append(key, new ParseText(parse.getText()));
+ if (textOut!=null) {
+ textOut.append(key, new ParseText(parse.getText()));
+ }
ParseData parseData = parse.getData();
// recover the signature prepared by Fetcher or ParseSegment
@@ -311,7 +321,7 @@ public class ParseOutputFormat implements OutputFormat<Text, Parse> {
}
public void close(Reporter reporter) throws IOException {
- textOut.close();
+ if (textOut!=null) textOut.close();
dataOut.close();
crawlOut.close();
}
--
To stop receiving notification emails like this one, please contact
"commits@nutch.apache.org" <co...@nutch.apache.org>.