You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by le...@apache.org on 2015/01/29 06:39:03 UTC
svn commit: r1655526 [11/26] - in /nutch/trunk: ./
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/
src/java/org/apache/nutch/net/ src/java/org/apache/nutch/net/p...
Modified: nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java Thu Jan 29 05:38:59 2015
@@ -35,16 +35,15 @@ import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
-
/** Utility that converts DMOZ RDF into a flat file of URLs to be injected. */
public class DmozParser {
public static final Logger LOG = LoggerFactory.getLogger(DmozParser.class);
-
- long pages = 0;
+
+ long pages = 0;
/**
- * This filter fixes characters that might offend our parser.
- * This lets us be tolerant of errors that might appear in the input XML.
+ * This filter fixes characters that might offend our parser. This lets us be
+ * tolerant of errors that might appear in the input XML.
*/
private static class XMLCharFilter extends FilterReader {
private boolean lastBad = false;
@@ -56,9 +55,9 @@ public class DmozParser {
public int read() throws IOException {
int c = in.read();
int value = c;
- if (c != -1 && !(XMLChar.isValid(c))) // fix invalid characters
+ if (c != -1 && !(XMLChar.isValid(c))) // fix invalid characters
value = 'X';
- else if (lastBad && c == '<') { // fix mis-matched brackets
+ else if (lastBad && c == '<') { // fix mis-matched brackets
in.mark(1);
if (in.read() != '/')
value = 'X';
@@ -69,37 +68,35 @@ public class DmozParser {
return value;
}
- public int read(char[] cbuf, int off, int len)
- throws IOException {
+ public int read(char[] cbuf, int off, int len) throws IOException {
int n = in.read(cbuf, off, len);
if (n != -1) {
for (int i = 0; i < n; i++) {
- char c = cbuf[off+i];
+ char c = cbuf[off + i];
char value = c;
- if (!(XMLChar.isValid(c))) // fix invalid characters
+ if (!(XMLChar.isValid(c))) // fix invalid characters
value = 'X';
- else if (lastBad && c == '<') { // fix mis-matched brackets
- if (i != n-1 && cbuf[off+i+1] != '/')
+ else if (lastBad && c == '<') { // fix mis-matched brackets
+ if (i != n - 1 && cbuf[off + i + 1] != '/')
value = 'X';
}
lastBad = (c == 65533);
- cbuf[off+i] = value;
+ cbuf[off + i] = value;
}
}
return n;
}
}
-
/**
- * The RDFProcessor receives tag messages during a parse
- * of RDF XML data. We build whatever structures we need
- * from these messages.
+ * The RDFProcessor receives tag messages during a parse of RDF XML data. We
+ * build whatever structures we need from these messages.
*/
private class RDFProcessor extends DefaultHandler {
String curURL = null, curSection = null;
- boolean titlePending = false, descPending = false, insideAdultSection = false;
- Pattern topicPattern = null;
+ boolean titlePending = false, descPending = false,
+ insideAdultSection = false;
+ Pattern topicPattern = null;
StringBuffer title = new StringBuffer(), desc = new StringBuffer();
XMLReader reader;
int subsetDenom;
@@ -108,10 +105,12 @@ public class DmozParser {
Locator location;
/**
- * Pass in an XMLReader, plus a flag as to whether we
- * should include adult material.
+ * Pass in an XMLReader, plus a flag as to whether we should include adult
+ * material.
*/
- public RDFProcessor(XMLReader reader, int subsetDenom, boolean includeAdult, int skew, Pattern topicPattern) throws IOException {
+ public RDFProcessor(XMLReader reader, int subsetDenom,
+ boolean includeAdult, int skew, Pattern topicPattern)
+ throws IOException {
this.reader = reader;
this.subsetDenom = subsetDenom;
this.includeAdult = includeAdult;
@@ -127,20 +126,21 @@ public class DmozParser {
/**
* Start of an XML elt
*/
- public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
+ public void startElement(String namespaceURI, String localName,
+ String qName, Attributes atts) throws SAXException {
if ("Topic".equals(qName)) {
curSection = atts.getValue("r:id");
} else if ("ExternalPage".equals(qName)) {
// Porn filter
- if ((! includeAdult) && curSection.startsWith("Top/Adult")) {
+ if ((!includeAdult) && curSection.startsWith("Top/Adult")) {
return;
}
-
+
if (topicPattern != null && !topicPattern.matcher(curSection).matches()) {
return;
}
- // Subset denominator filter.
+ // Subset denominator filter.
// Only emit with a chance of 1/denominator.
String url = atts.getValue("about");
int hashValue = MD5Hash.digest(url).hashCode();
@@ -173,18 +173,18 @@ public class DmozParser {
* Termination of XML elt
*/
public void endElement(String namespaceURI, String localName, String qName)
- throws SAXException {
+ throws SAXException {
if (curURL != null) {
if ("ExternalPage".equals(qName)) {
//
- // Inc the number of pages, insert the page, and
+ // Inc the number of pages, insert the page, and
// possibly print status.
//
- System.out.println(curURL);
+ System.out.println(curURL);
pages++;
//
- // Clear out the link text. This is what
+ // Clear out the link text. This is what
// you would use for adding to the linkdb.
//
if (title.length() > 0) {
@@ -219,15 +219,13 @@ public class DmozParser {
}
/**
- * From time to time the Parser will set the "current location"
- * by calling this function. It's useful for emitting locations
- * for error messages.
+ * From time to time the Parser will set the "current location" by calling
+ * this function. It's useful for emitting locations for error messages.
*/
public void setDocumentLocator(Locator locator) {
location = locator;
}
-
//
// Interface ErrorHandler
//
@@ -247,11 +245,11 @@ public class DmozParser {
public void errorError(SAXParseException spe) {
if (LOG.isErrorEnabled()) {
LOG.error("Fatal err: " + spe.toString() + ": " + spe.getMessage());
- LOG.error("Last known line is " + location.getLineNumber() +
- ", column " + location.getColumnNumber());
+ LOG.error("Last known line is " + location.getLineNumber()
+ + ", column " + location.getColumnNumber());
}
}
-
+
/**
* Emit exception warning message
*/
@@ -263,34 +261,33 @@ public class DmozParser {
}
/**
- * Iterate through all the items in this structured DMOZ file.
- * Add each URL to the web db.
+ * Iterate through all the items in this structured DMOZ file. Add each URL to
+ * the web db.
*/
public void parseDmozFile(File dmozFile, int subsetDenom,
- boolean includeAdult,
- int skew,
- Pattern topicPattern)
+ boolean includeAdult, int skew, Pattern topicPattern)
- throws IOException, SAXException, ParserConfigurationException {
+ throws IOException, SAXException, ParserConfigurationException {
SAXParserFactory parserFactory = SAXParserFactory.newInstance();
SAXParser parser = parserFactory.newSAXParser();
XMLReader reader = parser.getXMLReader();
// Create our own processor to receive SAX events
- RDFProcessor rp =
- new RDFProcessor(reader, subsetDenom, includeAdult,
- skew, topicPattern);
+ RDFProcessor rp = new RDFProcessor(reader, subsetDenom, includeAdult, skew,
+ topicPattern);
reader.setContentHandler(rp);
reader.setErrorHandler(rp);
LOG.info("skew = " + rp.hashSkew);
//
- // Open filtered text stream. The TextFilter makes sure that
+ // Open filtered text stream. The TextFilter makes sure that
// only appropriate XML-approved Text characters are received.
// Any non-conforming characters are silently skipped.
//
- XMLCharFilter in = new XMLCharFilter(new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), "UTF-8")));
+ XMLCharFilter in = new XMLCharFilter(new BufferedReader(
+ new InputStreamReader(new BufferedInputStream(new FileInputStream(
+ dmozFile)), "UTF-8")));
try {
InputSource is = new InputSource(in);
reader.parse(is);
@@ -304,18 +301,17 @@ public class DmozParser {
}
}
- private static void addTopicsFromFile(String topicFile,
- Vector<String> topics)
- throws IOException {
+ private static void addTopicsFromFile(String topicFile, Vector<String> topics)
+ throws IOException {
BufferedReader in = null;
try {
- in = new BufferedReader(new InputStreamReader(new FileInputStream(topicFile), "UTF-8"));
+ in = new BufferedReader(new InputStreamReader(new FileInputStream(
+ topicFile), "UTF-8"));
String line = null;
while ((line = in.readLine()) != null) {
topics.addElement(new String(line));
}
- }
- catch (Exception e) {
+ } catch (Exception e) {
if (LOG.isErrorEnabled()) {
LOG.error(e.toString());
}
@@ -324,18 +320,19 @@ public class DmozParser {
in.close();
}
}
-
+
/**
- * Command-line access. User may add URLs via a flat text file
- * or the structured DMOZ file. By default, we ignore Adult
- * material (as categorized by DMOZ).
+ * Command-line access. User may add URLs via a flat text file or the
+ * structured DMOZ file. By default, we ignore Adult material (as categorized
+ * by DMOZ).
*/
public static void main(String argv[]) throws Exception {
if (argv.length < 1) {
- System.err.println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
+ System.err
+ .println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
return;
}
-
+
//
// Parse the command line, figure out what kind of
// URL file we need to load
@@ -344,9 +341,9 @@ public class DmozParser {
int skew = 0;
String dmozFile = argv[0];
boolean includeAdult = false;
- Pattern topicPattern = null;
+ Pattern topicPattern = null;
Vector<String> topics = new Vector<String>();
-
+
Configuration conf = NutchConfiguration.create();
FileSystem fs = FileSystem.get(conf);
try {
@@ -354,16 +351,16 @@ public class DmozParser {
if ("-includeAdultMaterial".equals(argv[i])) {
includeAdult = true;
} else if ("-subset".equals(argv[i])) {
- subsetDenom = Integer.parseInt(argv[i+1]);
+ subsetDenom = Integer.parseInt(argv[i + 1]);
i++;
} else if ("-topic".equals(argv[i])) {
- topics.addElement(argv[i+1]);
+ topics.addElement(argv[i + 1]);
i++;
} else if ("-topicFile".equals(argv[i])) {
- addTopicsFromFile(argv[i+1], topics);
+ addTopicsFromFile(argv[i + 1], topics);
i++;
} else if ("-skew".equals(argv[i])) {
- skew = Integer.parseInt(argv[i+1]);
+ skew = Integer.parseInt(argv[i + 1]);
i++;
}
}
@@ -371,21 +368,21 @@ public class DmozParser {
DmozParser parser = new DmozParser();
if (!topics.isEmpty()) {
- String regExp = new String("^(");
+ String regExp = new String("^(");
int j = 0;
- for ( ; j < topics.size() - 1; ++j) {
+ for (; j < topics.size() - 1; ++j) {
regExp = regExp.concat(topics.get(j));
regExp = regExp.concat("|");
}
regExp = regExp.concat(topics.get(j));
- regExp = regExp.concat(").*");
+ regExp = regExp.concat(").*");
LOG.info("Topic selection pattern = " + regExp);
- topicPattern = Pattern.compile(regExp);
+ topicPattern = Pattern.compile(regExp);
}
- parser.parseDmozFile(new File(dmozFile), subsetDenom,
- includeAdult, skew, topicPattern);
-
+ parser.parseDmozFile(new File(dmozFile), subsetDenom, includeAdult, skew,
+ topicPattern);
+
} finally {
fs.close();
}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FileDumper.java Thu Jan 29 05:38:59 2015
@@ -54,20 +54,26 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * <p>The file dumper tool enables one to reverse generate the raw content
- * from Nutch segment data directories. </p>
+ * <p>
+ * The file dumper tool enables one to reverse generate the raw content from
+ * Nutch segment data directories.
+ * </p>
* <p>
* The tool has a number of immediate uses:
* <ol>
* <li>one can see what a page looked like at the time it was crawled</li>
* <li>one can see different media types acquired as part of the crawl</li>
- * <li>it enables us to see webpages before we augment them with additional metadata,
- * this can be handy for providing a provenance trail for your crawl data.</li>
+ * <li>it enables us to see webpages before we augment them with additional
+ * metadata, this can be handy for providing a provenance trail for your crawl
+ * data.</li>
* </ol>
* </p>
- * <p>Upon successful completion the tool displays a very convenient JSON snippet
- * detailing the mimetype classifications and the counts of documents which
- * fall into those classifications. An example is as follows:</p>
+ * <p>
+ * Upon successful completion the tool displays a very convenient JSON snippet
+ * detailing the mimetype classifications and the counts of documents which fall
+ * into those classifications. An example is as follows:
+ * </p>
+ *
* <pre>
* {@code
* INFO: File Types:
@@ -92,45 +98,53 @@ import org.slf4j.LoggerFactory;
* }
* }
* </pre>
- * <p>In the case above the tool would have been run with the <b>-mimeType
- * image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b>
+ * <p>
+ * In the case above the tool would have been run with the <b>-mimeType
+ * image/png image/jpeg image/vnd.microsoft.icon video/quicktime image/gif</b>
* flag and corresponding values activated.
- *
+ *
*/
public class FileDumper {
private static final Logger LOG = LoggerFactory.getLogger(FileDumper.class
.getName());
-
/**
- * Dumps the reverse engineered raw content from the provided segment directories
- * if a parent directory contains more than one segment, otherwise a single segment
- * can be passed as an argument.
- * @param outputDir the directory you wish to dump the raw content to. This directory will be created.
- * @param segmentRootDir a directory containing one or more segments.
- * @param mimeTypes an array of mime types we have to dump, all others will be filtered out.
+ * Dumps the reverse engineered raw content from the provided segment
+ * directories if a parent directory contains more than one segment, otherwise
+ * a single segment can be passed as an argument.
+ *
+ * @param outputDir
+ * the directory you wish to dump the raw content to. This directory
+ * will be created.
+ * @param segmentRootDir
+ * a directory containing one or more segments.
+ * @param mimeTypes
+ * an array of mime types we have to dump, all others will be
+ * filtered out.
* @throws Exception
*/
- public void dump(File outputDir, File segmentRootDir, String[] mimeTypes) throws Exception {
- if (mimeTypes == null) LOG.info("Accepting all mimetypes.");
- //total file counts
+ public void dump(File outputDir, File segmentRootDir, String[] mimeTypes)
+ throws Exception {
+ if (mimeTypes == null)
+ LOG.info("Accepting all mimetypes.");
+ // total file counts
Map<String, Integer> typeCounts = new HashMap<String, Integer>();
- //filtered file counts
+ // filtered file counts
Map<String, Integer> filteredCounts = new HashMap<String, Integer>();
Configuration conf = NutchConfiguration.create();
FileSystem fs = FileSystem.get(conf);
int fileCount = 0;
- File[] segmentDirs = segmentRootDir
- .listFiles(new FileFilter() {
+ File[] segmentDirs = segmentRootDir.listFiles(new FileFilter() {
- @Override
- public boolean accept(File file) {
- return file.canRead() && file.isDirectory();
- }
- });
+ @Override
+ public boolean accept(File file) {
+ return file.canRead() && file.isDirectory();
+ }
+ });
if (segmentDirs == null) {
- System.err.println("No segment directories found in [" + segmentRootDir.getAbsolutePath() + "]");
+ System.err.println("No segment directories found in ["
+ + segmentRootDir.getAbsolutePath() + "]");
return;
}
@@ -138,18 +152,17 @@ public class FileDumper {
LOG.info("Processing segment: [" + segment.getAbsolutePath() + "]");
DataOutputStream doutputStream = null;
try {
- String segmentPath = segment.getAbsolutePath()
- + "/" + Content.DIR_NAME + "/part-00000/data";
+ String segmentPath = segment.getAbsolutePath() + "/" + Content.DIR_NAME
+ + "/part-00000/data";
Path file = new Path(segmentPath);
if (!new File(file.toString()).exists()) {
LOG.warn("Skipping segment: [" + segmentPath
+ "]: no data directory present");
continue;
}
- SequenceFile.Reader reader = new SequenceFile.Reader(fs, file,
- conf);
+ SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
- Writable key = (Writable)reader.getKeyClass().newInstance();
+ Writable key = (Writable) reader.getKeyClass().newInstance();
Content content = null;
while (reader.next(key)) {
@@ -158,35 +171,33 @@ public class FileDumper {
String url = key.toString();
String baseName = FilenameUtils.getBaseName(url);
String extension = FilenameUtils.getExtension(url);
- if (extension == null || (extension != null &&
- extension.equals(""))){
+ if (extension == null || (extension != null && extension.equals(""))) {
extension = "html";
}
String filename = baseName + "." + extension;
ByteArrayInputStream bas = null;
Boolean filter = false;
- try{
+ try {
bas = new ByteArrayInputStream(content.getContent());
String mimeType = new Tika().detect(content.getContent());
collectStats(typeCounts, mimeType);
if (mimeType != null) {
- if (mimeTypes == null || Arrays.asList(mimeTypes).contains(mimeType)) {
+ if (mimeTypes == null
+ || Arrays.asList(mimeTypes).contains(mimeType)) {
collectStats(filteredCounts, mimeType);
filter = true;
}
}
- }
- catch(Exception e){
+ } catch (Exception e) {
e.printStackTrace();
- LOG.warn("Tika is unable to detect type for: ["+url+"]");
- }
- finally{
- if(bas != null){
- try{
+ LOG.warn("Tika is unable to detect type for: [" + url + "]");
+ } finally {
+ if (bas != null) {
+ try {
bas.close();
+ } catch (Exception ignore) {
}
- catch(Exception ignore){}
}
}
@@ -199,51 +210,58 @@ public class FileDumper {
IOUtils.write(content.getContent(), output);
fileCount++;
} else {
- LOG.info("Skipping writing: ["
- + outputFullPath + "]: file already exists");
+ LOG.info("Skipping writing: [" + outputFullPath
+ + "]: file already exists");
}
}
}
reader.close();
- }
- finally {
+ } finally {
fs.close();
- if (doutputStream != null){
- try{
+ if (doutputStream != null) {
+ try {
doutputStream.close();
+ } catch (Exception ignore) {
}
- catch (Exception ignore){}
}
}
}
- LOG.info("Dumper File Stats: " + displayFileTypes(typeCounts, filteredCounts));
+ LOG.info("Dumper File Stats: "
+ + displayFileTypes(typeCounts, filteredCounts));
}
/**
* Main method for invoking this tool
- * @param args 1) output directory (which will be created) to host the
- * raw data and 2) a directory containing one or more segments.
+ *
+ * @param args
+ * 1) output directory (which will be created) to host the raw data
+ * and 2) a directory containing one or more segments.
* @throws Exception
*/
public static void main(String[] args) throws Exception {
- //boolean options
+ // boolean options
Option helpOpt = new Option("h", "help", false, "show this help message");
- //argument options
+ // argument options
@SuppressWarnings("static-access")
- Option outputOpt = OptionBuilder.withArgName("outputDir")
- .hasArg().withDescription("output directory (which will be created) to host the raw data")
- .create("outputDir");
+ Option outputOpt = OptionBuilder
+ .withArgName("outputDir")
+ .hasArg()
+ .withDescription(
+ "output directory (which will be created) to host the raw data")
+ .create("outputDir");
@SuppressWarnings("static-access")
- Option segOpt = OptionBuilder.withArgName("segment")
- .hasArgs().withDescription("the segment(s) to use")
- .create("segment");
+ Option segOpt = OptionBuilder.withArgName("segment").hasArgs()
+ .withDescription("the segment(s) to use").create("segment");
@SuppressWarnings("static-access")
- Option mimeOpt = OptionBuilder.withArgName("mimetype")
- .hasArgs().withDescription("an optional list of mimetypes to dump, excluding all others. Defaults to all.")
- .create("mimetype");
+ Option mimeOpt = OptionBuilder
+ .withArgName("mimetype")
+ .hasArgs()
+ .withDescription(
+ "an optional list of mimetypes to dump, excluding all others. Defaults to all.")
+ .create("mimetype");
- //create the options
+ // create the options
Options options = new Options();
options.addOption(helpOpt);
options.addOption(outputOpt);
@@ -267,13 +285,14 @@ public class FileDumper {
if (!outputDir.exists()) {
LOG.warn("Output directory: [" + outputDir.getAbsolutePath()
+ "]: does not exist, creating it.");
- if(!outputDir.mkdirs()) throw new Exception("Unable to create: ["+outputDir.getAbsolutePath()+"]");
+ if (!outputDir.mkdirs())
+ throw new Exception("Unable to create: ["
+ + outputDir.getAbsolutePath() + "]");
}
FileDumper dumper = new FileDumper();
dumper.dump(outputDir, segmentRootDir, mimeTypes);
- }
- catch(Exception e) {
+ } catch (Exception e) {
LOG.error("FileDumper: " + StringUtils.stringifyException(e));
e.printStackTrace();
return;
@@ -282,13 +301,13 @@ public class FileDumper {
private void collectStats(Map<String, Integer> typeCounts, String mimeType) {
typeCounts.put(mimeType,
- typeCounts.containsKey(mimeType) ? typeCounts.get(mimeType) + 1
- : 1);
+ typeCounts.containsKey(mimeType) ? typeCounts.get(mimeType) + 1 : 1);
}
- private String displayFileTypes(Map<String, Integer> typeCounts, Map<String, Integer> filteredCounts) {
- StringBuilder builder = new StringBuilder();
- //print total stats
+ private String displayFileTypes(Map<String, Integer> typeCounts,
+ Map<String, Integer> filteredCounts) {
+ StringBuilder builder = new StringBuilder();
+ // print total stats
builder.append("\n TOTAL Stats:\n");
builder.append(" {\n");
for (String mimeType : typeCounts.keySet()) {
Modified: nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java Thu Jan 29 05:38:59 2015
@@ -54,19 +54,20 @@ import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
/**
- * This tool generates fetchlists (segments to be fetched) from plain text
- * files containing one URL per line. It's useful when arbitrary URL-s need to
- * be fetched without adding them first to the CrawlDb, or during testing.
+ * This tool generates fetchlists (segments to be fetched) from plain text files
+ * containing one URL per line. It's useful when arbitrary URL-s need to be
+ * fetched without adding them first to the CrawlDb, or during testing.
*/
public class FreeGenerator extends Configured implements Tool {
- private static final Logger LOG = LoggerFactory.getLogger(FreeGenerator.class);
-
+ private static final Logger LOG = LoggerFactory
+ .getLogger(FreeGenerator.class);
+
private static final String FILTER_KEY = "free.generator.filter";
private static final String NORMALIZE_KEY = "free.generator.normalize";
- public static class FG extends MapReduceBase
- implements Mapper<WritableComparable<?>, Text, Text, Generator.SelectorEntry>,
- Reducer<Text, Generator.SelectorEntry, Text, CrawlDatum> {
+ public static class FG extends MapReduceBase implements
+ Mapper<WritableComparable<?>, Text, Text, Generator.SelectorEntry>,
+ Reducer<Text, Generator.SelectorEntry, Text, CrawlDatum> {
private URLNormalizers normalizers = null;
private URLFilters filters = null;
private ScoringFilters scfilters;
@@ -89,13 +90,15 @@ public class FreeGenerator extends Confi
Generator.SelectorEntry entry = new Generator.SelectorEntry();
- public void map(WritableComparable<?> key, Text value, OutputCollector<Text,
- Generator.SelectorEntry> output, Reporter reporter) throws IOException {
+ public void map(WritableComparable<?> key, Text value,
+ OutputCollector<Text, Generator.SelectorEntry> output, Reporter reporter)
+ throws IOException {
// value is a line of text
String urlString = value.toString();
try {
if (normalizers != null) {
- urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_INJECT);
+ urlString = normalizers.normalize(urlString,
+ URLNormalizers.SCOPE_INJECT);
}
if (urlString != null && filters != null) {
urlString = filters.filter(urlString);
@@ -105,7 +108,8 @@ public class FreeGenerator extends Confi
scfilters.injectedScore(url, datum);
}
} catch (Exception e) {
- LOG.warn("Error adding url '" + value.toString() + "', skipping: " + StringUtils.stringifyException(e));
+ LOG.warn("Error adding url '" + value.toString() + "', skipping: "
+ + StringUtils.stringifyException(e));
return;
}
if (urlString == null) {
@@ -122,8 +126,10 @@ public class FreeGenerator extends Confi
}
public void reduce(Text key, Iterator<Generator.SelectorEntry> values,
- OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
- // pick unique urls from values - discard the reduce key due to hash collisions
+ OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+ throws IOException {
+ // pick unique urls from values - discard the reduce key due to hash
+ // collisions
HashMap<Text, CrawlDatum> unique = new HashMap<Text, CrawlDatum>();
while (values.hasNext()) {
Generator.SelectorEntry entry = values.next();
@@ -138,12 +144,17 @@ public class FreeGenerator extends Confi
public int run(String[] args) throws Exception {
if (args.length < 2) {
- System.err.println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
- System.err.println("\tinputDir\tinput directory containing one or more input files.");
- System.err.println("\t\tEach text file contains a list of URLs, one URL per line");
- System.err.println("\tsegmentsDir\toutput directory, where new segment will be created");
+ System.err
+ .println("Usage: FreeGenerator <inputDir> <segmentsDir> [-filter] [-normalize]");
+ System.err
+ .println("\tinputDir\tinput directory containing one or more input files.");
+ System.err
+ .println("\t\tEach text file contains a list of URLs, one URL per line");
+ System.err
+ .println("\tsegmentsDir\toutput directory, where new segment will be created");
System.err.println("\t-filter\trun current URLFilters on input URLs");
- System.err.println("\t-normalize\trun current URLNormalizers on input URLs");
+ System.err
+ .println("\t-normalize\trun current URLNormalizers on input URLs");
return -1;
}
boolean filter = false;
@@ -181,8 +192,8 @@ public class FreeGenerator extends Confi
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(CrawlDatum.class);
job.setOutputKeyComparatorClass(Generator.HashComparator.class);
- FileOutputFormat.setOutputPath(job, new Path(args[1],
- new Path(segName, CrawlDatum.GENERATE_DIR_NAME)));
+ FileOutputFormat.setOutputPath(job, new Path(args[1], new Path(segName,
+ CrawlDatum.GENERATE_DIR_NAME)));
try {
JobClient.runJob(job);
} catch (Exception e) {
@@ -190,12 +201,14 @@ public class FreeGenerator extends Confi
return -1;
}
long end = System.currentTimeMillis();
- LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ LOG.info("FreeGenerator: finished at " + sdf.format(end) + ", elapsed: "
+ + TimingUtil.elapsedTime(start, end));
return 0;
}
public static void main(String[] args) throws Exception {
- int res = ToolRunner.run(NutchConfiguration.create(), new FreeGenerator(), args);
+ int res = ToolRunner.run(NutchConfiguration.create(), new FreeGenerator(),
+ args);
System.exit(res);
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java Thu Jan 29 05:38:59 2015
@@ -59,8 +59,7 @@ public class ResolveUrls {
/**
* A Thread which gets the ip address of a single host by name.
*/
- private static class ResolverThread
- extends Thread {
+ private static class ResolverThread extends Thread {
private String url = null;
@@ -74,14 +73,13 @@ public class ResolveUrls {
String host = URLUtil.getHost(url);
long start = System.currentTimeMillis();
try {
-
- // get the address by name and if no error is thrown then it
+
+ // get the address by name and if no error is thrown then it
// is resolved successfully
InetAddress.getByName(host);
LOG.info("Resolved: " + host);
numResolved.incrementAndGet();
- }
- catch (Exception uhe) {
+ } catch (Exception uhe) {
LOG.info("Error Resolving: " + host);
numErrored.incrementAndGet();
}
@@ -93,8 +91,8 @@ public class ResolveUrls {
}
/**
- * Creates a thread pool for resolving urls. Reads in the url file on the
- * local filesystem. For each url it attempts to resolve it keeping a total
+ * Creates a thread pool for resolving urls. Reads in the url file on the
+ * local filesystem. For each url it attempts to resolve it keeping a total
* account of the number resolved, errored, and the amount of time.
*/
public void resolveUrls() {
@@ -103,13 +101,13 @@ public class ResolveUrls {
// create a thread pool with a fixed number of threads
pool = Executors.newFixedThreadPool(numThreads);
-
+
// read in the urls file and loop through each line, one url per line
BufferedReader buffRead = new BufferedReader(new FileReader(new File(
- urlsFile)));
+ urlsFile)));
String urlStr = null;
while ((urlStr = buffRead.readLine()) != null) {
-
+
// spin up a resolver thread per url
LOG.info("Starting: " + urlStr);
pool.execute(new ResolverThread(urlStr));
@@ -119,9 +117,8 @@ public class ResolveUrls {
// the thread pool to give urls time to finish resolving
buffRead.close();
pool.awaitTermination(60, TimeUnit.SECONDS);
- }
- catch (Exception e) {
-
+ } catch (Exception e) {
+
// on error shutdown the thread pool immediately
pool.shutdownNow();
LOG.info(StringUtils.stringifyException(e));
@@ -129,15 +126,16 @@ public class ResolveUrls {
// shutdown the thread pool and log totals
pool.shutdown();
- LOG.info("Total: " + numTotal.get() + ", Resovled: "
- + numResolved.get() + ", Errored: " + numErrored.get()
- + ", Average Time: " + totalTime.get() / numTotal.get());
+ LOG.info("Total: " + numTotal.get() + ", Resovled: " + numResolved.get()
+ + ", Errored: " + numErrored.get() + ", Average Time: "
+ + totalTime.get() / numTotal.get());
}
/**
* Create a new ResolveUrls with a file from the local file system.
- *
- * @param urlsFile The local urls file, one url per line.
+ *
+ * @param urlsFile
+ * The local urls file, one url per line.
*/
public ResolveUrls(String urlsFile) {
this(urlsFile, 100);
@@ -145,10 +143,12 @@ public class ResolveUrls {
/**
* Create a new ResolveUrls with a urls file and a number of threads for the
- * Thread pool. Number of threads is 100 by default.
+ * Thread pool. Number of threads is 100 by default.
*
- * @param urlsFile The local urls file, one url per line.
- * @param numThreads The number of threads used to resolve urls in parallel.
+ * @param urlsFile
+ * The local urls file, one url per line.
+ * @param numThreads
+ * The number of threads used to resolve urls in parallel.
*/
public ResolveUrls(String urlsFile, int numThreads) {
this.urlsFile = urlsFile;
@@ -165,13 +165,13 @@ public class ResolveUrls {
OptionBuilder.withDescription("show this help message");
Option helpOpts = OptionBuilder.create("help");
options.addOption(helpOpts);
-
+
OptionBuilder.withArgName("urls");
OptionBuilder.hasArg();
OptionBuilder.withDescription("the urls file to check");
Option urlOpts = OptionBuilder.create("urls");
options.addOption(urlOpts);
-
+
OptionBuilder.withArgName("numThreads");
OptionBuilder.hasArgs();
OptionBuilder.withDescription("the number of threads to use");
@@ -197,8 +197,7 @@ public class ResolveUrls {
}
ResolveUrls resolve = new ResolveUrls(urls, numThreads);
resolve.resolveUrls();
- }
- catch (Exception e) {
+ } catch (Exception e) {
LOG.error("ResolveUrls: " + StringUtils.stringifyException(e));
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java Thu Jan 29 05:38:59 2015
@@ -30,21 +30,22 @@ import org.apache.hadoop.mapred.Reporter
/**
* A input format the reads arc files.
*/
-public class ArcInputFormat
- extends FileInputFormat<Text, BytesWritable> {
+public class ArcInputFormat extends FileInputFormat<Text, BytesWritable> {
/**
* Returns the <code>RecordReader</code> for reading the arc file.
*
- * @param split The InputSplit of the arc file to process.
- * @param job The job configuration.
- * @param reporter The progress reporter.
+ * @param split
+ * The InputSplit of the arc file to process.
+ * @param job
+ * The job configuration.
+ * @param reporter
+ * The progress reporter.
*/
public RecordReader<Text, BytesWritable> getRecordReader(InputSplit split,
- JobConf job, Reporter reporter)
- throws IOException {
+ JobConf job, Reporter reporter) throws IOException {
reporter.setStatus(split.toString());
- return new ArcRecordReader(job, (FileSplit)split);
+ return new ArcRecordReader(job, (FileSplit) split);
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java Thu Jan 29 05:38:59 2015
@@ -34,23 +34,28 @@ import org.apache.hadoop.util.Reflection
import org.apache.hadoop.util.StringUtils;
/**
- * <p>The <code>ArchRecordReader</code> class provides a record reader which
- * reads records from arc files.</p>
+ * <p>
+ * The <code>ArchRecordReader</code> class provides a record reader which reads
+ * records from arc files.
+ * </p>
*
- * <p>Arc files are essentially tars of gzips. Each record in an arc file is
- * a compressed gzip. Multiple records are concatenated together to form a
- * complete arc. For more information on the arc file format see
- * {@link http://www.archive.org/web/researcher/ArcFileFormat.php } .</p>
+ * <p>
+ * Arc files are essentially tars of gzips. Each record in an arc file is a
+ * compressed gzip. Multiple records are concatenated together to form a
+ * complete arc. For more information on the arc file format see {@link http
+ * ://www.archive.org/web/researcher/ArcFileFormat.php } .
+ * </p>
*
- * <p>Arc files are used by the internet archive and grub projects.</p>
+ * <p>
+ * Arc files are used by the internet archive and grub projects.
+ * </p>
*
- * see {@link http://www.archive.org/ }
- * see {@link http://www.grub.org/ }
+ * see {@link http://www.archive.org/ } see {@link http://www.grub.org/ }
*/
-public class ArcRecordReader
- implements RecordReader<Text, BytesWritable> {
+public class ArcRecordReader implements RecordReader<Text, BytesWritable> {
- public static final Logger LOG = LoggerFactory.getLogger(ArcRecordReader.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(ArcRecordReader.class);
protected Configuration conf;
protected long splitStart = 0;
@@ -60,30 +65,32 @@ public class ArcRecordReader
protected long fileLen = 0;
protected FSDataInputStream in;
- private static byte[] MAGIC = {(byte)0x1F, (byte)0x8B};
+ private static byte[] MAGIC = { (byte) 0x1F, (byte) 0x8B };
/**
- * <p>Returns true if the byte array passed matches the gzip header magic
- * number.</p>
+ * <p>
+ * Returns true if the byte array passed matches the gzip header magic number.
+ * </p>
*
- * @param input The byte array to check.
+ * @param input
+ * The byte array to check.
*
* @return True if the byte array matches the gzip header magic number.
*/
public static boolean isMagic(byte[] input) {
- // check for null and incorrect length
+ // check for null and incorrect length
if (input == null || input.length != MAGIC.length) {
return false;
}
-
+
// check byte by byte
for (int i = 0; i < MAGIC.length; i++) {
if (MAGIC[i] != input[i]) {
return false;
}
}
-
+
// must match
return true;
}
@@ -91,13 +98,16 @@ public class ArcRecordReader
/**
* Constructor that sets the configuration and file split.
*
- * @param conf The job configuration.
- * @param split The file split to read from.
+ * @param conf
+ * The job configuration.
+ * @param split
+ * The file split to read from.
*
- * @throws IOException If an IO error occurs while initializing file split.
+ * @throws IOException
+ * If an IO error occurs while initializing file split.
*/
public ArcRecordReader(Configuration conf, FileSplit split)
- throws IOException {
+ throws IOException {
Path path = split.getPath();
FileSystem fs = path.getFileSystem(conf);
@@ -113,8 +123,7 @@ public class ArcRecordReader
/**
* Closes the record reader resources.
*/
- public void close()
- throws IOException {
+ public void close() throws IOException {
this.in.close();
}
@@ -137,63 +146,64 @@ public class ArcRecordReader
*
* @return The long of the current position in the file.
*/
- public long getPos()
- throws IOException {
+ public long getPos() throws IOException {
return in.getPos();
}
/**
- * Returns the percentage of progress in processing the file. This will be
+ * Returns the percentage of progress in processing the file. This will be
* represented as a float from 0 to 1 with 1 being 100% completed.
*
* @return The percentage of progress as a float from 0 to 1.
*/
- public float getProgress()
- throws IOException {
-
+ public float getProgress() throws IOException {
+
// if we haven't even started
if (splitEnd == splitStart) {
return 0.0f;
- }
- else {
- // the progress is current pos - where we started / length of the split
- return Math.min(1.0f, (getPos() - splitStart) / (float)splitLen);
+ } else {
+ // the progress is current pos - where we started / length of the split
+ return Math.min(1.0f, (getPos() - splitStart) / (float) splitLen);
}
}
/**
- * <p>Returns true if the next record in the split is read into the key and
- * value pair. The key will be the arc record header and the values will be
- * the raw content bytes of the arc record.</p>
+ * <p>
+ * Returns true if the next record in the split is read into the key and value
+ * pair. The key will be the arc record header and the values will be the raw
+ * content bytes of the arc record.
+ * </p>
*
- * @param key The record key
- * @param value The record value
+ * @param key
+ * The record key
+ * @param value
+ * The record value
*
* @return True if the next record is read.
*
- * @throws IOException If an error occurs while reading the record value.
+ * @throws IOException
+ * If an error occurs while reading the record value.
*/
- public boolean next(Text key, BytesWritable value)
- throws IOException {
+ public boolean next(Text key, BytesWritable value) throws IOException {
try {
-
+
// get the starting position on the input stream
long startRead = in.getPos();
byte[] magicBuffer = null;
-
+
// we need this loop to handle false positives in reading of gzip records
while (true) {
-
+
// while we haven't passed the end of the split
if (startRead >= splitEnd) {
return false;
}
-
+
// scanning for the gzip header
boolean foundStart = false;
while (!foundStart) {
-
+
// start at the current file position and scan for 1K at time, break
// if there is no more to read
startRead = in.getPos();
@@ -202,13 +212,13 @@ public class ArcRecordReader
if (read < 0) {
break;
}
-
- // scan the byte array for the gzip header magic number. This happens
+
+ // scan the byte array for the gzip header magic number. This happens
// byte by byte
for (int i = 0; i < read - 1; i++) {
byte[] testMagic = new byte[2];
- System.arraycopy(magicBuffer, i, testMagic, 0, 2);
- if (isMagic(testMagic)) {
+ System.arraycopy(magicBuffer, i, testMagic, 0, 2);
+ if (isMagic(testMagic)) {
// set the next start to the current gzip header
startRead += i;
foundStart = true;
@@ -216,14 +226,14 @@ public class ArcRecordReader
}
}
}
-
+
// seek to the start of the gzip header
in.seek(startRead);
ByteArrayOutputStream baos = null;
int totalRead = 0;
try {
-
+
// read 4K of the gzip at a time putting into a byte array
byte[] buffer = new byte[4096];
GZIPInputStream zin = new GZIPInputStream(in);
@@ -233,9 +243,8 @@ public class ArcRecordReader
baos.write(buffer, 0, gzipRead);
totalRead += gzipRead;
}
- }
- catch (Exception e) {
-
+ } catch (Exception e) {
+
// there are times we get false positives where the gzip header exists
// but it is not an actual gzip record, so we ignore it and start
// over seeking
@@ -248,7 +257,7 @@ public class ArcRecordReader
// change the output stream to a byte array
byte[] content = baos.toByteArray();
-
+
// the first line of the raw content in arc files is the header
int eol = 0;
for (int i = 0; i < content.length; i++) {
@@ -257,34 +266,33 @@ public class ArcRecordReader
break;
}
}
-
+
// create the header and the raw content minus the header
String header = new String(content, 0, eol).trim();
byte[] raw = new byte[(content.length - eol) - 1];
System.arraycopy(content, eol + 1, raw, 0, raw.length);
-
+
// populate key and values with the header and raw content.
Text keyText = key;
keyText.set(header);
BytesWritable valueBytes = value;
valueBytes.set(raw, 0, raw.length);
- // TODO: It would be best to start at the end of the gzip read but
- // the bytes read in gzip don't match raw bytes in the file so we
- // overshoot the next header. With this current method you get
+ // TODO: It would be best to start at the end of the gzip read but
+ // the bytes read in gzip don't match raw bytes in the file so we
+ // overshoot the next header. With this current method you get
// some false positives but don't miss records.
if (startRead + 1 < fileLen) {
in.seek(startRead + 1);
}
-
+
// populated the record, now return
return true;
}
+ } catch (Exception e) {
+ LOG.equals(StringUtils.stringifyException(e));
}
- catch (Exception e) {
- LOG.equals(StringUtils.stringifyException(e));
- }
-
+
// couldn't populate the record or there is no next record to read
return false;
}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java Thu Jan 29 05:38:59 2015
@@ -61,18 +61,22 @@ import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TimingUtil;
/**
- * <p>The <code>ArcSegmentCreator</code> is a replacement for fetcher that will
- * take arc files as input and produce a nutch segment as output.</p>
+ * <p>
+ * The <code>ArcSegmentCreator</code> is a replacement for fetcher that will
+ * take arc files as input and produce a nutch segment as output.
+ * </p>
*
- * <p>Arc files are tars of compressed gzips which are produced by both the
- * internet archive project and the grub distributed crawler project.</p>
+ * <p>
+ * Arc files are tars of compressed gzips which are produced by both the
+ * internet archive project and the grub distributed crawler project.
+ * </p>
*
*/
-public class ArcSegmentCreator
- extends Configured
- implements Tool, Mapper<Text, BytesWritable, Text, NutchWritable> {
+public class ArcSegmentCreator extends Configured implements Tool,
+ Mapper<Text, BytesWritable, Text, NutchWritable> {
- public static final Logger LOG = LoggerFactory.getLogger(ArcSegmentCreator.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(ArcSegmentCreator.class);
public static final String URL_VERSION = "arc.url.version";
private JobConf jobConf;
private URLFilters urlFilters;
@@ -88,7 +92,9 @@ public class ArcSegmentCreator
}
/**
- * <p>Constructor that sets the job configuration.</p>
+ * <p>
+ * Constructor that sets the job configuration.
+ * </p>
*
* @param conf
*/
@@ -104,17 +110,19 @@ public class ArcSegmentCreator
public static synchronized String generateSegmentName() {
try {
Thread.sleep(1000);
- }
- catch (Throwable t) {
+ } catch (Throwable t) {
}
return sdf.format(new Date(System.currentTimeMillis()));
}
/**
- * <p>Configures the job. Sets the url filters, scoring filters, url normalizers
- * and other relevant data.</p>
+ * <p>
+ * Configures the job. Sets the url filters, scoring filters, url normalizers
+ * and other relevant data.
+ * </p>
*
- * @param job The job configuration.
+ * @param job
+ * The job configuration.
*/
public void configure(JobConf job) {
@@ -132,23 +140,31 @@ public class ArcSegmentCreator
}
/**
- * <p>Parses the raw content of a single record to create output. This method
- * is almost the same as the {@link org.apache.nutch.Fetcher#output} method in
- * terms of processing and output.
+ * <p>
+ * Parses the raw content of a single record to create output. This method is
+ * almost the same as the {@link org.apache.nutch.Fetcher#output} method in
+ * terms of processing and output.
*
- * @param output The job output collector.
- * @param segmentName The name of the segment to create.
- * @param key The url of the record.
- * @param datum The CrawlDatum of the record.
- * @param content The raw content of the record
- * @param pstatus The protocol status
- * @param status The fetch status.
+ * @param output
+ * The job output collector.
+ * @param segmentName
+ * The name of the segment to create.
+ * @param key
+ * The url of the record.
+ * @param datum
+ * The CrawlDatum of the record.
+ * @param content
+ * The raw content of the record
+ * @param pstatus
+ * The protocol status
+ * @param status
+ * The fetch status.
*
* @return The result of the parse in a ParseStatus object.
*/
- private ParseStatus output(OutputCollector<Text, NutchWritable> output, String segmentName,
- Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus,
- int status) {
+ private ParseStatus output(OutputCollector<Text, NutchWritable> output,
+ String segmentName, Text key, CrawlDatum datum, Content content,
+ ProtocolStatus pstatus, int status) {
// set the fetch status and the fetch time
datum.setStatus(status);
@@ -164,8 +180,7 @@ public class ArcSegmentCreator
// add score to content metadata so that ParseSegment can pick it up.
try {
scfilters.passScoreBeforeParsing(key, datum, content);
- }
- catch (Exception e) {
+ } catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
}
@@ -175,16 +190,15 @@ public class ArcSegmentCreator
// parse the content
parseResult = this.parseUtil.parse(content);
- }
- catch (Exception e) {
+ } catch (Exception e) {
LOG.warn("Error parsing: " + key + ": "
- + StringUtils.stringifyException(e));
+ + StringUtils.stringifyException(e));
}
// set the content signature
if (parseResult == null) {
byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
- content, new ParseStatus().getEmptyParse(getConf()));
+ content, new ParseStatus().getEmptyParse(getConf()));
datum.setSignature(signature);
}
@@ -193,7 +207,7 @@ public class ArcSegmentCreator
output.collect(key, new NutchWritable(content));
if (parseResult != null) {
- for (Entry <Text, Parse> entry : parseResult) {
+ for (Entry<Text, Parse> entry : parseResult) {
Text url = entry.getKey();
Parse parse = entry.getValue();
ParseStatus parseStatus = parse.getData().getStatus();
@@ -203,35 +217,34 @@ public class ArcSegmentCreator
parse = parseStatus.getEmptyParse(getConf());
}
- // Calculate page signature.
- byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
- content, parse);
+ // Calculate page signature.
+ byte[] signature = SignatureFactory.getSignature(getConf())
+ .calculate(content, parse);
// Ensure segment name and score are in parseData metadata
- parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
- segmentName);
- parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
- StringUtil.toHexString(signature));
+ parse.getData().getContentMeta()
+ .set(Nutch.SEGMENT_NAME_KEY, segmentName);
+ parse.getData().getContentMeta()
+ .set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
// Pass fetch time to content meta
- parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
- Long.toString(datum.getFetchTime()));
+ parse.getData().getContentMeta()
+ .set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
if (url.equals(key))
datum.setSignature(signature);
try {
scfilters.passScoreAfterParsing(url, content, parse);
- }
- catch (Exception e) {
+ } catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
}
}
output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
- parse.getText()), parse.getData(), parse.isCanonical())));
+ parse.getText()), parse.getData(), parse.isCanonical())));
}
}
- }
- catch (IOException e) {
+ } catch (IOException e) {
if (LOG.isErrorEnabled()) {
- LOG.error("ArcSegmentCreator caught:" + StringUtils.stringifyException(e));
+ LOG.error("ArcSegmentCreator caught:"
+ + StringUtils.stringifyException(e));
}
}
@@ -243,42 +256,51 @@ public class ArcSegmentCreator
}
}
}
-
+
return null;
}
/**
- * <p>Logs any error that occurs during conversion.</p>
+ * <p>
+ * Logs any error that occurs during conversion.
+ * </p>
*
- * @param url The url we are parsing.
- * @param t The error that occured.
+ * @param url
+ * The url we are parsing.
+ * @param t
+ * The error that occured.
*/
private void logError(Text url, Throwable t) {
if (LOG.isInfoEnabled()) {
- LOG.info("Conversion of " + url + " failed with: " +
- StringUtils.stringifyException(t));
+ LOG.info("Conversion of " + url + " failed with: "
+ + StringUtils.stringifyException(t));
}
}
/**
- * <p>Runs the Map job to translate an arc record into output for Nutch
- * segments.</p>
+ * <p>
+ * Runs the Map job to translate an arc record into output for Nutch segments.
+ * </p>
*
- * @param key The arc record header.
- * @param bytes The arc record raw content bytes.
- * @param output The output collecter.
- * @param reporter The progress reporter.
+ * @param key
+ * The arc record header.
+ * @param bytes
+ * The arc record raw content bytes.
+ * @param output
+ * The output collecter.
+ * @param reporter
+ * The progress reporter.
*/
public void map(Text key, BytesWritable bytes,
- OutputCollector<Text, NutchWritable> output, Reporter reporter)
- throws IOException {
+ OutputCollector<Text, NutchWritable> output, Reporter reporter)
+ throws IOException {
String[] headers = key.toString().split("\\s+");
String urlStr = headers[0];
String version = headers[2];
String contentType = headers[3];
-
- // arcs start with a file description. for now we ignore this as it is not
+
+ // arcs start with a file description. for now we ignore this as it is not
// a content record
if (urlStr.startsWith("filedesc://")) {
LOG.info("Ignoring file header: " + urlStr);
@@ -286,18 +308,17 @@ public class ArcSegmentCreator
}
LOG.info("Processing: " + urlStr);
- // get the raw bytes from the arc file, create a new crawldatum
+ // get the raw bytes from the arc file, create a new crawldatum
Text url = new Text();
CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval,
- 1.0f);
+ 1.0f);
String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);
// normalize and filter the urls
try {
urlStr = normalizers.normalize(urlStr, URLNormalizers.SCOPE_FETCHER);
urlStr = urlFilters.filter(urlStr); // filter the url
- }
- catch (Exception e) {
+ } catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Skipping " + url + ":" + e);
}
@@ -312,37 +333,41 @@ public class ArcSegmentCreator
// set the protocol status to success and the crawl status to success
// create the content from the normalized url and the raw bytes from
- // the arc file, TODO: currently this doesn't handle text of errors
+ // the arc file, TODO: currently this doesn't handle text of errors
// pages (i.e. 404, etc.). We assume we won't get those.
ProtocolStatus status = ProtocolStatus.STATUS_SUCCESS;
- Content content = new Content(urlStr, urlStr, bytes.getBytes(), contentType,
- new Metadata(), getConf());
-
+ Content content = new Content(urlStr, urlStr, bytes.getBytes(),
+ contentType, new Metadata(), getConf());
+
// set the url version into the metadata
content.getMetadata().set(URL_VERSION, version);
ParseStatus pstatus = null;
pstatus = output(output, segmentName, url, datum, content, status,
- CrawlDatum.STATUS_FETCH_SUCCESS);
+ CrawlDatum.STATUS_FETCH_SUCCESS);
reporter.progress();
- }
- catch (Throwable t) { // unexpected exception
+ } catch (Throwable t) { // unexpected exception
logError(url, t);
output(output, segmentName, url, datum, null, null,
- CrawlDatum.STATUS_FETCH_RETRY);
+ CrawlDatum.STATUS_FETCH_RETRY);
}
}
}
/**
- * <p>Creates the arc files to segments job.</p>
+ * <p>
+ * Creates the arc files to segments job.
+ * </p>
*
- * @param arcFiles The path to the directory holding the arc files
- * @param segmentsOutDir The output directory for writing the segments
+ * @param arcFiles
+ * The path to the directory holding the arc files
+ * @param segmentsOutDir
+ * The output directory for writing the segments
*
- * @throws IOException If an IO error occurs while running the job.
+ * @throws IOException
+ * If an IO error occurs while running the job.
*/
public void createSegments(Path arcFiles, Path segmentsOutDir)
- throws IOException {
+ throws IOException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
long start = System.currentTimeMillis();
@@ -366,17 +391,17 @@ public class ArcSegmentCreator
JobClient.runJob(job);
long end = System.currentTimeMillis();
- LOG.info("ArcSegmentCreator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
+ LOG.info("ArcSegmentCreator: finished at " + sdf.format(end)
+ + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
- public static void main(String args[])
- throws Exception {
- int res = ToolRunner.run(NutchConfiguration.create(), new ArcSegmentCreator(), args);
+ public static void main(String args[]) throws Exception {
+ int res = ToolRunner.run(NutchConfiguration.create(),
+ new ArcSegmentCreator(), args);
System.exit(res);
}
- public int run(String[] args)
- throws Exception {
+ public int run(String[] args) throws Exception {
String usage = "Usage: ArcSegmentCreator <arcFiles> <segmentsOutDir>";
@@ -393,8 +418,7 @@ public class ArcSegmentCreator
// create the segments from the arc files
createSegments(arcFiles, segmentsOutDir);
return 0;
- }
- catch (Exception e) {
+ } catch (Exception e) {
LOG.error("ArcSegmentCreator: " + StringUtils.stringifyException(e));
return -1;
}
Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/arc/package-info.java Thu Jan 29 05:38:59 2015
@@ -20,3 +20,4 @@
* <a href="http://archive.org/web/researcher/ArcFileFormat.php">Arc file format</a>.
*/
package org.apache.nutch.tools.arc;
+
Modified: nutch/trunk/src/java/org/apache/nutch/tools/package-info.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/package-info.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/package-info.java Thu Jan 29 05:38:59 2015
@@ -19,3 +19,4 @@
* Miscellaneous tools.
*/
package org.apache.nutch.tools;
+
Modified: nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/CommandRunner.java Thu Jan 29 05:38:59 2015
@@ -82,11 +82,11 @@ public class CommandRunner {
}
public void evaluate() throws IOException {
- this.exec();
+ this.exec();
}
/**
- *
+ *
* @return process exit value (return code) or -1 if timed out.
* @throws IOException
*/
@@ -94,13 +94,11 @@ public class CommandRunner {
Process proc = Runtime.getRuntime().exec(_command);
_barrier = new CyclicBarrier(3 + ((_stdin != null) ? 1 : 0));
- PullerThread so =
- new PullerThread("STDOUT", proc.getInputStream(), _stdout);
+ PullerThread so = new PullerThread("STDOUT", proc.getInputStream(), _stdout);
so.setDaemon(true);
so.start();
- PullerThread se =
- new PullerThread("STDERR", proc.getErrorStream(), _stderr);
+ PullerThread se = new PullerThread("STDERR", proc.getErrorStream(), _stderr);
se.setDaemon(true);
se.start();
@@ -145,11 +143,11 @@ public class CommandRunner {
Thread.sleep(1000);
_xit = proc.exitValue();
} catch (InterruptedException ie) {
- if (Thread.interrupted()) {
- break; // stop waiting on an interrupt for this thread
- } else {
- continue;
- }
+ if (Thread.interrupted()) {
+ break; // stop waiting on an interrupt for this thread
+ } else {
+ continue;
+ }
} catch (IllegalThreadStateException iltse) {
continue;
}
@@ -181,11 +179,8 @@ public class CommandRunner {
private boolean _closeInput;
- protected PumperThread(
- String name,
- InputStream is,
- OutputStream os,
- boolean closeInput) {
+ protected PumperThread(String name, InputStream is, OutputStream os,
+ boolean closeInput) {
super(name);
_is = is;
_os = os;
@@ -218,12 +213,12 @@ public class CommandRunner {
}
}
try {
- _barrier.await();
- } catch (InterruptedException ie) {
- /* IGNORE */
- } catch (BrokenBarrierException bbe) {
- /* IGNORE */
- }
+ _barrier.await();
+ } catch (InterruptedException ie) {
+ /* IGNORE */
+ } catch (BrokenBarrierException bbe) {
+ /* IGNORE */
+ }
}
}
@@ -269,8 +264,9 @@ public class CommandRunner {
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-timeout")) {
- timeout = Integer.parseInt(args[++i]);;
- } else if (i != args.length-2) {
+ timeout = Integer.parseInt(args[++i]);
+ ;
+ } else if (i != args.length - 2) {
System.err.println(usage);
System.exit(-1);
} else {
@@ -290,6 +286,6 @@ public class CommandRunner {
cr.evaluate();
- System.err.println("output value: "+cr.getExitValue());
+ System.err.println("output value: " + cr.getExitValue());
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/DeflateUtils.java Thu Jan 29 05:38:59 2015
@@ -28,19 +28,18 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * A collection of utility methods for working on deflated data.
+ * A collection of utility methods for working on deflated data.
*/
public class DeflateUtils {
-
+
private static final Logger LOG = LoggerFactory.getLogger(DeflateUtils.class);
private static final int EXPECTED_COMPRESSION_RATIO = 5;
private static final int BUF_SIZE = 4096;
/**
- * Returns an inflated copy of the input array. If the deflated
- * input has been truncated or corrupted, a best-effort attempt is
- * made to inflate as much as possible. If no data can be extracted
- * <code>null</code> is returned.
+ * Returns an inflated copy of the input array. If the deflated input has been
+ * truncated or corrupted, a best-effort attempt is made to inflate as much as
+ * possible. If no data can be extracted <code>null</code> is returned.
*/
public static final byte[] inflateBestEffort(byte[] in) {
return inflateBestEffort(in, Integer.MAX_VALUE);
@@ -48,37 +47,36 @@ public class DeflateUtils {
/**
* Returns an inflated copy of the input array, truncated to
- * <code>sizeLimit</code> bytes, if necessary. If the deflated input
- * has been truncated or corrupted, a best-effort attempt is made to
- * inflate as much as possible. If no data can be extracted
- * <code>null</code> is returned.
+ * <code>sizeLimit</code> bytes, if necessary. If the deflated input has been
+ * truncated or corrupted, a best-effort attempt is made to inflate as much as
+ * possible. If no data can be extracted <code>null</code> is returned.
*/
public static final byte[] inflateBestEffort(byte[] in, int sizeLimit) {
- // decompress using InflaterInputStream
- ByteArrayOutputStream outStream =
- new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length);
+ // decompress using InflaterInputStream
+ ByteArrayOutputStream outStream = new ByteArrayOutputStream(
+ EXPECTED_COMPRESSION_RATIO * in.length);
// "true" because HTTP does not provide zlib headers
Inflater inflater = new Inflater(true);
- InflaterInputStream inStream =
- new InflaterInputStream(new ByteArrayInputStream(in), inflater);
+ InflaterInputStream inStream = new InflaterInputStream(
+ new ByteArrayInputStream(in), inflater);
byte[] buf = new byte[BUF_SIZE];
int written = 0;
while (true) {
try {
- int size = inStream.read(buf);
- if (size <= 0)
- break;
- if ((written + size) > sizeLimit) {
- outStream.write(buf, 0, sizeLimit - written);
- break;
- }
- outStream.write(buf, 0, size);
- written+= size;
+ int size = inStream.read(buf);
+ if (size <= 0)
+ break;
+ if ((written + size) > sizeLimit) {
+ outStream.write(buf, 0, sizeLimit - written);
+ break;
+ }
+ outStream.write(buf, 0, size);
+ written += size;
} catch (Exception e) {
- LOG.info( "Caught Exception in inflateBestEffort", e );
- break;
+ LOG.info("Caught Exception in inflateBestEffort", e);
+ break;
}
}
try {
@@ -89,23 +87,24 @@ public class DeflateUtils {
return outStream.toByteArray();
}
-
/**
- * Returns an inflated copy of the input array.
- * @throws IOException if the input cannot be properly decompressed
+ * Returns an inflated copy of the input array.
+ *
+ * @throws IOException
+ * if the input cannot be properly decompressed
*/
public static final byte[] inflate(byte[] in) throws IOException {
- // decompress using InflaterInputStream
- ByteArrayOutputStream outStream =
- new ByteArrayOutputStream(EXPECTED_COMPRESSION_RATIO * in.length);
+ // decompress using InflaterInputStream
+ ByteArrayOutputStream outStream = new ByteArrayOutputStream(
+ EXPECTED_COMPRESSION_RATIO * in.length);
- InflaterInputStream inStream =
- new InflaterInputStream ( new ByteArrayInputStream(in) );
+ InflaterInputStream inStream = new InflaterInputStream(
+ new ByteArrayInputStream(in));
byte[] buf = new byte[BUF_SIZE];
while (true) {
int size = inStream.read(buf);
- if (size <= 0)
+ if (size <= 0)
break;
outStream.write(buf, 0, size);
}
@@ -118,9 +117,9 @@ public class DeflateUtils {
* Returns a deflated copy of the input array.
*/
public static final byte[] deflate(byte[] in) {
- // compress using DeflaterOutputStream
- ByteArrayOutputStream byteOut =
- new ByteArrayOutputStream(in.length / EXPECTED_COMPRESSION_RATIO);
+ // compress using DeflaterOutputStream
+ ByteArrayOutputStream byteOut = new ByteArrayOutputStream(in.length
+ / EXPECTED_COMPRESSION_RATIO);
DeflaterOutputStream outStream = new DeflaterOutputStream(byteOut);
Modified: nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/DomUtil.java Thu Jan 29 05:38:59 2015
@@ -38,7 +38,6 @@ import org.xml.sax.SAXException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
public class DomUtil {
private final static Logger LOG = LoggerFactory.getLogger(DomUtil.class);
@@ -61,10 +60,10 @@ public class DomUtil {
input.setEncoding("UTF-8");
parser.parse(input);
int i = 0;
- while (! (parser.getDocument().getChildNodes().item(i) instanceof Element)) {
- i++;
- }
- element = (Element)parser.getDocument().getChildNodes().item(i);
+ while (!(parser.getDocument().getChildNodes().item(i) instanceof Element)) {
+ i++;
+ }
+ element = (Element) parser.getDocument().getChildNodes().item(i);
} catch (FileNotFoundException e) {
LOG.error("Error: ", e);
} catch (SAXException e) {
Modified: nutch/trunk/src/java/org/apache/nutch/util/EncodingDetector.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/EncodingDetector.java?rev=1655526&r1=1655525&r2=1655526&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/util/EncodingDetector.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/util/EncodingDetector.java Thu Jan 29 05:38:59 2015
@@ -39,27 +39,26 @@ import com.ibm.icu.text.CharsetMatch;
/**
* A simple class for detecting character encodings.
- *
+ *
* <p>
* Broadly this encompasses two functions, which are distinctly separate:
- *
+ *
* <ol>
- * <li>Auto detecting a set of "clues" from input text.</li>
- * <li>Taking a set of clues and making a "best guess" as to the
- * "real" encoding.</li>
+ * <li>Auto detecting a set of "clues" from input text.</li>
+ * <li>Taking a set of clues and making a "best guess" as to the "real"
+ * encoding.</li>
* </ol>
* </p>
- *
+ *
* <p>
- * A caller will often have some extra information about what the
- * encoding might be (e.g. from the HTTP header or HTML meta-tags, often
- * wrong but still potentially useful clues). The types of clues may differ
- * from caller to caller. Thus a typical calling sequence is:
+ * A caller will often have some extra information about what the encoding might
+ * be (e.g. from the HTTP header or HTML meta-tags, often wrong but still
+ * potentially useful clues). The types of clues may differ from caller to
+ * caller. Thus a typical calling sequence is:
* <ul>
- * <li>Run step (1) to generate a set of auto-detected clues;</li>
- * <li>Combine these clues with the caller-dependent "extra clues"
- * available;</li>
- * <li>Run step (2) to guess what the most probable answer is.</li>
+ * <li>Run step (1) to generate a set of auto-detected clues;</li>
+ * <li>Combine these clues with the caller-dependent "extra clues" available;</li>
+ * <li>Run step (2) to guess what the most probable answer is.</li>
* </p>
*/
public class EncodingDetector {
@@ -89,34 +88,32 @@ public class EncodingDetector {
}
public String toString() {
- return value + " (" + source +
- ((confidence >= 0) ? ", " + confidence + "% confidence" : "") + ")";
+ return value + " (" + source
+ + ((confidence >= 0) ? ", " + confidence + "% confidence" : "") + ")";
}
public boolean isEmpty() {
- return (value==null || "".equals(value));
+ return (value == null || "".equals(value));
}
public boolean meetsThreshold() {
- return (confidence < 0 ||
- (minConfidence >= 0 && confidence >= minConfidence));
+ return (confidence < 0 || (minConfidence >= 0 && confidence >= minConfidence));
}
}
- public static final Logger LOG = LoggerFactory.getLogger(EncodingDetector.class);
+ public static final Logger LOG = LoggerFactory
+ .getLogger(EncodingDetector.class);
public static final int NO_THRESHOLD = -1;
- public static final String MIN_CONFIDENCE_KEY =
- "encodingdetector.charset.min.confidence";
+ public static final String MIN_CONFIDENCE_KEY = "encodingdetector.charset.min.confidence";
- private static final HashMap<String, String> ALIASES =
- new HashMap<String, String>();
+ private static final HashMap<String, String> ALIASES = new HashMap<String, String>();
private static final HashSet<String> DETECTABLES = new HashSet<String>();
// CharsetDetector will die without a minimum amount of data.
- private static final int MIN_LENGTH=4;
+ private static final int MIN_LENGTH = 4;
static {
DETECTABLES.add("text/html");
@@ -129,23 +126,22 @@ public class EncodingDetector {
DETECTABLES.add("application/rss+xml");
DETECTABLES.add("application/xhtml+xml");
/*
- * the following map is not an alias mapping table, but
- * maps character encodings which are often used in mislabelled
- * documents to their correct encodings. For instance,
- * there are a lot of documents labelled 'ISO-8859-1' which contain
- * characters not covered by ISO-8859-1 but covered by windows-1252.
- * Because windows-1252 is a superset of ISO-8859-1 (sharing code points
- * for the common part), it's better to treat ISO-8859-1 as
- * synonymous with windows-1252 than to reject, as invalid, documents
- * labelled as ISO-8859-1 that have characters outside ISO-8859-1.
+ * the following map is not an alias mapping table, but maps character
+ * encodings which are often used in mislabelled documents to their correct
+ * encodings. For instance, there are a lot of documents labelled
+ * 'ISO-8859-1' which contain characters not covered by ISO-8859-1 but
+ * covered by windows-1252. Because windows-1252 is a superset of ISO-8859-1
+ * (sharing code points for the common part), it's better to treat
+ * ISO-8859-1 as synonymous with windows-1252 than to reject, as invalid,
+ * documents labelled as ISO-8859-1 that have characters outside ISO-8859-1.
*/
ALIASES.put("ISO-8859-1", "windows-1252");
ALIASES.put("EUC-KR", "x-windows-949");
ALIASES.put("x-EUC-CN", "GB18030");
ALIASES.put("GBK", "GB18030");
- //ALIASES.put("Big5", "Big5HKSCS");
- //ALIASES.put("TIS620", "Cp874");
- //ALIASES.put("ISO-8859-11", "Cp874");
+ // ALIASES.put("Big5", "Big5HKSCS");
+ // ALIASES.put("TIS620", "Cp874");
+ // ALIASES.put("ISO-8859-11", "Cp874");
}
@@ -188,8 +184,9 @@ public class EncodingDetector {
}
// add character encoding coming from HTTP response header
- addClue(parseCharacterEncoding(
- content.getMetadata().get(Response.CONTENT_TYPE)), "header");
+ addClue(
+ parseCharacterEncoding(content.getMetadata().get(Response.CONTENT_TYPE)),
+ "header");
}
public void addClue(String value, String source, int confidence) {
@@ -208,21 +205,23 @@ public class EncodingDetector {
/**
* Guess the encoding with the previously specified list of clues.
- *
- * @param content Content instance
- * @param defaultValue Default encoding to return if no encoding can be
- * detected with enough confidence. Note that this will <b>not</b> be
- * normalized with {@link EncodingDetector#resolveEncodingAlias}
- *
+ *
+ * @param content
+ * Content instance
+ * @param defaultValue
+ * Default encoding to return if no encoding can be detected with
+ * enough confidence. Note that this will <b>not</b> be normalized
+ * with {@link EncodingDetector#resolveEncodingAlias}
+ *
* @return Guessed encoding or defaultValue
*/
public String guessEncoding(Content content, String defaultValue) {
/*
- * This algorithm could be replaced by something more sophisticated;
- * ideally we would gather a bunch of data on where various clues
- * (autodetect, HTTP headers, HTML meta tags, etc.) disagree, tag each with
- * the correct answer, and use machine learning/some statistical method
- * to generate a better heuristic.
+ * This algorithm could be replaced by something more sophisticated; ideally
+ * we would gather a bunch of data on where various clues (autodetect, HTTP
+ * headers, HTML meta tags, etc.) disagree, tag each with the correct
+ * answer, and use machine learning/some statistical method to generate a
+ * better heuristic.
*/
String base = content.getBaseUrl();
@@ -232,10 +231,9 @@ public class EncodingDetector {
}
/*
- * Go down the list of encoding "clues". Use a clue if:
- * 1. Has a confidence value which meets our confidence threshold, OR
- * 2. Doesn't meet the threshold, but is the best try,
- * since nothing else is available.
+ * Go down the list of encoding "clues". Use a clue if: 1. Has a confidence
+ * value which meets our confidence threshold, OR 2. Doesn't meet the
+ * threshold, but is the best try, since nothing else is available.
*/
EncodingClue defaultClue = new EncodingClue(defaultValue, "default");
EncodingClue bestClue = defaultClue;
@@ -247,8 +245,8 @@ public class EncodingDetector {
String charset = clue.value;
if (minConfidence >= 0 && clue.confidence >= minConfidence) {
if (LOG.isTraceEnabled()) {
- LOG.trace(base + ": Choosing encoding: " + charset +
- " with confidence " + clue.confidence);
+ LOG.trace(base + ": Choosing encoding: " + charset
+ + " with confidence " + clue.confidence);
}
return resolveEncodingAlias(charset).toLowerCase();
} else if (clue.confidence == NO_THRESHOLD && bestClue == defaultClue) {
@@ -268,10 +266,10 @@ public class EncodingDetector {
}
/*
- * Strictly for analysis, look for "disagreements." The top guess from
- * each source is examined; if these meet the threshold and disagree, then
- * we log the information -- useful for testing or generating training data
- * for a better heuristic.
+ * Strictly for analysis, look for "disagreements." The top guess from each
+ * source is examined; if these meet the threshold and disagree, then we log
+ * the information -- useful for testing or generating training data for a
+ * better heuristic.
*/
private void findDisagreements(String url, List<EncodingClue> newClues) {
HashSet<String> valsSeen = new HashSet<String>();
@@ -293,9 +291,9 @@ public class EncodingDetector {
if (disagreement) {
// dump all values in case of disagreement
StringBuffer sb = new StringBuffer();
- sb.append("Disagreement: "+url+"; ");
+ sb.append("Disagreement: " + url + "; ");
for (int i = 0; i < newClues.size(); i++) {
- if (i>0) {
+ if (i > 0) {
sb.append(", ");
}
sb.append(newClues.get(i));
@@ -310,7 +308,7 @@ public class EncodingDetector {
return null;
String canonicalName = new String(Charset.forName(encoding).name());
return ALIASES.containsKey(canonicalName) ? ALIASES.get(canonicalName)
- : canonicalName;
+ : canonicalName;
} catch (Exception e) {
LOG.warn("Invalid encoding " + encoding + " detected, using default.");
return null;
@@ -318,14 +316,14 @@ public class EncodingDetector {
}
/**
- * Parse the character encoding from the specified content type header.
- * If the content type is null, or there is no explicit character encoding,
- * <code>null</code> is returned.
- * <br />
- * This method was copied from org.apache.catalina.util.RequestUtil,
- * which is licensed under the Apache License, Version 2.0 (the "License").
- *
- * @param contentType a content type header
+ * Parse the character encoding from the specified content type header. If the
+ * content type is null, or there is no explicit character encoding,
+ * <code>null</code> is returned. <br />
+ * This method was copied from org.apache.catalina.util.RequestUtil, which is
+ * licensed under the Apache License, Version 2.0 (the "License").
+ *
+ * @param contentType
+ * a content type header
*/
public static String parseCharacterEncoding(String contentType) {
if (contentType == null)
@@ -339,7 +337,7 @@ public class EncodingDetector {
encoding = encoding.substring(0, end);
encoding = encoding.trim();
if ((encoding.length() > 2) && (encoding.startsWith("\""))
- && (encoding.endsWith("\"")))
+ && (encoding.endsWith("\"")))
encoding = encoding.substring(1, encoding.length() - 1);
return (encoding.trim());
@@ -352,12 +350,12 @@ public class EncodingDetector {
}
Configuration conf = NutchConfiguration.create();
- EncodingDetector detector =
- new EncodingDetector(NutchConfiguration.create());
+ EncodingDetector detector = new EncodingDetector(
+ NutchConfiguration.create());
// do everything as bytes; don't want any conversion
- BufferedInputStream istr =
- new BufferedInputStream(new FileInputStream(args[0]));
+ BufferedInputStream istr = new BufferedInputStream(new FileInputStream(
+ args[0]));
ByteArrayOutputStream ostr = new ByteArrayOutputStream();
byte[] bytes = new byte[1000];
boolean more = true;
@@ -376,8 +374,8 @@ public class EncodingDetector {
byte[] data = ostr.toByteArray();
// make a fake Content
- Content content =
- new Content("", "", data, "text/html", new Metadata(), conf);
+ Content content = new Content("", "", data, "text/html", new Metadata(),
+ conf);
detector.autoDetectClues(content, true);
String encoding = detector.guessEncoding(content,