You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2006/11/13 20:46:57 UTC
svn commit: r474464 - in /lucene/nutch/trunk: ./
src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/protocol/
src/test/org/apache/nutch/metadata/ src/test/org/apache/nutch/protocol/
Author: siren
Date: Mon Nov 13 11:46:56 2006
New Revision: 474464
URL: http://svn.apache.org/viewvc?view=rev&rev=474464
Log:
NUTCH-395 Increase fetching speed
Added:
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java
lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=474464&r1=474463&r2=474464
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Mon Nov 13 11:46:56 2006
@@ -71,6 +71,7 @@
22. NUTCH-399 - Change CommandRunner to use concurrent api from jdk (siren)
+23. NUTCH-395 - Increase fetching speed (siren)
Release 0.8 - 2006-07-25
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?view=diff&rev=474464&r1=474463&r2=474464
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Mon Nov 13 11:46:56 2006
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -16,103 +16,58 @@
*/
package org.apache.nutch.metadata;
-// JDK imports
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
-import java.lang.reflect.Field;
-import java.lang.reflect.Modifier;
-import java.text.SimpleDateFormat;
-import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
import java.util.Map;
import java.util.Properties;
-
-// Commons Lang imports
-import org.apache.commons.lang.StringUtils;
-
-// Hadoop imports
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
/**
- * A syntax tolerant and multi-valued metadata container.
- *
- * All the static String fields declared by this class are used as reference
- * names for syntax correction on meta-data naming.
+ * A multi-valued metadata container.
*
* @author Chris Mattmann
* @author Jérôme Charron
+ *
*/
-public class Metadata implements CreativeCommons,
- DublinCore,
- HttpHeaders,
- Nutch,
- Office,
- Writable {
-
-
- /** Used to format DC dates for the DATE metadata field */
- public final static SimpleDateFormat DATE_FORMAT =
- new SimpleDateFormat("yyyy-MM-dd");
-
-
- private final static Map NAMES_IDX = new HashMap();
- private static String[] normalized = null;
-
- // Uses self introspection to fill the metanames index and the
- // metanames list.
- static {
- Field[] fields = Metadata.class.getFields();
- for (int i=0; i<fields.length; i++) {
- int mods = fields[i].getModifiers();
- if (Modifier.isFinal(mods) &&
- Modifier.isPublic(mods) &&
- Modifier.isStatic(mods) &&
- fields[i].getType().equals(String.class)) {
- try {
- String val = (String) fields[i].get(null);
- NAMES_IDX.put(normalize(val), val);
- } catch (Exception e) {
- // Simply ignore...
- }
- }
- }
- normalized = (String[]) NAMES_IDX.keySet().toArray(new String[NAMES_IDX.size()]);
- }
-
-
- /** A map of all metadata attributes */
- private Map metadata = null;
+public class Metadata implements Writable, CreativeCommons,
+DublinCore, HttpHeaders, Nutch, Office {
+
+ /**
+ * A map of all metadata attributes.
+ */
+ private Map<String, String[]> metadata = null;
+
-
- /** Constructs a new, empty metadata. */
+ /**
+ * Constructs a new, empty metadata.
+ */
public Metadata() {
- metadata = new HashMap();
+ metadata = new HashMap<String, String[]>();
}
/**
+ * Returns true if named value is multivalued.
+ * @param name name of metadata
+ * @return true if named value is multivalued, false if single
+ * value or null
*/
- public boolean isMultiValued(String name) {
- return getValues(name).length > 1;
+ public boolean isMultiValued(final String name) {
+ return metadata.get(name) != null && metadata.get(name).length > 1;
}
/**
* Returns an array of the names contained in the metadata.
+ * @return Metadata names
*/
public String[] names() {
- Iterator iter = metadata.keySet().iterator();
- List names = new ArrayList();
- while(iter.hasNext()) {
- names.add(getNormalizedName((String) iter.next()));
- }
- return (String[]) names.toArray(new String[names.size()]);
+ return metadata.keySet().toArray(new String[metadata.keySet().size()]);
}
-
+
/**
* Get the value associated to a metadata name.
* If many values are assiociated to the specified name, then the first
@@ -121,12 +76,12 @@
* @param name of the metadata.
* @return the value associated to the specified metadata name.
*/
- public String get(String name) {
- Object values = metadata.get(getNormalizedName(name));
- if ((values != null) && (values instanceof List)) {
- return (String) ((List) values).get(0);
+ public String get(final String name) {
+ String[] values = metadata.get(name);
+ if (values == null) {
+ return null;
} else {
- return (String) values;
+ return values[0];
}
}
@@ -135,19 +90,14 @@
* @param name of the metadata.
* @return the values associated to a metadata name.
*/
- public String[] getValues(String name) {
- Object values = metadata.get(getNormalizedName(name));
- if (values != null) {
- if (values instanceof List) {
- List list = (List) values;
- return (String[]) list.toArray(new String[list.size()]);
- } else {
- return new String[] { (String) values };
- }
+ public String[] getValues(final String name) {
+ String[] values = metadata.get(name);
+ if (values == null) {
+ values = new String[0];
}
- return new String[0];
+ return values;
}
-
+
/**
* Add a metadata name/value mapping.
* Add the specified value to the list of values associated to the
@@ -156,31 +106,30 @@
* @param name the metadata name.
* @param value the metadata value.
*/
- public void add(String name, String value) {
- String normalized = getNormalizedName(name);
- Object values = metadata.get(normalized);
- if (values != null) {
- if (values instanceof String) {
- List list = new ArrayList();
- list.add(values);
- list.add(value);
- metadata.put(normalized, list);
- } else if (values instanceof List) {
- ((List) values).add(value);
- }
+ public void add(final String name, final String value) {
+ String[] values = metadata.get(name);
+ if (values == null) {
+ set(name, value);
} else {
- metadata.put(normalized, value);
+ String[] newValues = new String[values.length + 1];
+ System.arraycopy(values, 0, newValues, 0, values.length);
+ newValues[newValues.length - 1] = value;
+ metadata.put(name, newValues);
}
}
+ /**
+ * Copy All key-value pairs from properties.
+ * @param properties properties to copy from
+ */
public void setAll(Properties properties) {
Enumeration names = properties.propertyNames();
while (names.hasMoreElements()) {
String name = (String) names.nextElement();
- set(name, properties.getProperty(name));
+ metadata.put(name, new String[]{properties.getProperty(name)});
}
}
-
+
/**
* Set metadata name/value.
* Associate the specified value to the specified metadata name. If some
@@ -190,46 +139,46 @@
* @param value the metadata value.
*/
public void set(String name, String value) {
- remove(name);
- add(name, value);
+ metadata.put(name, new String[]{value});
}
/**
* Remove a metadata and all its associated values.
+ * @param name metadata name to remove
*/
public void remove(String name) {
- metadata.remove(getNormalizedName(name));
+ metadata.remove(name);
}
-
+
/**
* Returns the number of metadata names in this metadata.
+ * @return number of metadata names
*/
public int size() {
return metadata.size();
}
-
- // Inherited Javadoc
+
public boolean equals(Object o) {
-
+
if (o == null) { return false; }
-
+
Metadata other = null;
try {
other = (Metadata) o;
} catch (ClassCastException cce) {
return false;
}
-
+
if (other.size() != size()) { return false; }
-
+
String[] names = names();
- for (int i=0; i<names.length; i++) {
+ for (int i = 0; i < names.length; i++) {
String[] otherValues = other.getValues(names[i]);
String[] thisValues = getValues(names[i]);
if (otherValues.length != thisValues.length) {
return false;
}
- for (int j=0; j<otherValues.length; j++) {
+ for (int j = 0; j < otherValues.length; j++) {
if (!otherValues[j].equals(thisValues[j])) {
return false;
}
@@ -238,13 +187,12 @@
return true;
}
- // Inherited Javadoc
public String toString() {
StringBuffer buf = new StringBuffer();
String[] names = names();
- for (int i=0; i<names.length; i++) {
+ for (int i = 0; i < names.length; i++) {
String[] values = getValues(names[i]);
- for (int j=0; j<values.length; j++) {
+ for (int j = 0; j < values.length; j++) {
buf.append(names[i])
.append("=")
.append(values[j])
@@ -253,83 +201,31 @@
}
return buf.toString();
}
-
-
- /**
- * Get the normalized name of metadata attribute name.
- * This method tries to find a well-known metadata name (one of the
- * metadata names defined in this class) that matches the specified name.
- * The matching is error tolerent. For instance,
- * <ul>
- * <li>content-type gives Content-Type</li>
- * <li>CoNtEntType gives Content-Type</li>
- * <li>ConTnTtYpe gives Content-Type</li>
- * </ul>
- * If no matching with a well-known metadata name is found, then the original
- * name is returned.
- */
- public static String getNormalizedName(String name) {
- String searched = normalize(name);
- String value = (String) NAMES_IDX.get(searched);
-
- if ((value == null) && (normalized != null)) {
- int threshold = searched.length() / 3;
- for (int i=0; i<normalized.length && value == null; i++) {
- if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) {
- value = (String) NAMES_IDX.get(normalized[i]);
- }
- }
- }
- return (value != null) ? value : name;
- }
-
- private final static String normalize(String str) {
- char c;
- StringBuffer buf = new StringBuffer();
- for (int i=0; i<str.length(); i++) {
- c = str.charAt(i);
- if (Character.isLetter(c)) {
- buf.append(Character.toLowerCase(c));
- }
- }
- return buf.toString();
- }
-
- /* ------------------------- *
- * <implementation:Writable> *
- * ------------------------- */
-
- // Inherited Javadoc
public final void write(DataOutput out) throws IOException {
out.writeInt(size());
String[] values = null;
String[] names = names();
- for (int i=0; i<names.length; i++) {
+ for (int i = 0; i < names.length; i++) {
Text.writeString(out, names[i]);
values = getValues(names[i]);
out.writeInt(values.length);
- for (int j=0; j<values.length; j++) {
+ for (int j = 0; j < values.length; j++) {
Text.writeString(out, values[j]);
}
}
}
- // Inherited Javadoc
public final void readFields(DataInput in) throws IOException {
int keySize = in.readInt();
String key;
- for (int i=0; i<keySize; i++) {
+ for (int i = 0; i < keySize; i++) {
key = Text.readString(in);
int valueSize = in.readInt();
- for (int j=0; j<valueSize; j++) {
+ for (int j = 0; j < valueSize; j++) {
add(key, Text.readString(in));
}
}
}
- /* -------------------------- *
- * </implementation:Writable> *
- * -------------------------- */
-
}
Added: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java?view=auto&rev=474464
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java Mon Nov 13 11:46:56 2006
@@ -0,0 +1,147 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.lang.reflect.Field;
+import java.lang.reflect.Modifier;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.lang.StringUtils;
+
+/**
+ * A decorator to Metadata that adds spellchecking capabilities to property
+ * names.
+ *
+ * All the static String fields declared by this class are used as reference
+ * names for syntax correction on meta-data naming.
+ */
+public class SpellCheckedMetadata extends Metadata {
+
+ /**
+ * Threshold divider.
+ *
+ * <code>threshold = searched.length() / TRESHOLD_DIVIDER;</code>
+ */
+ private static final int TRESHOLD_DIVIDER = 3;
+
+ /**
+ * Normalized name to name mapping.
+ */
+ private final static Map<String, String> NAMES_IDX = new HashMap<String, String>();
+
+ /**
+ * Array holding map keys.
+ */
+ private static String[] normalized = null;
+
+ // Uses self introspection to fill the metanames index and the
+ // metanames list.
+ static {
+ for (Field field : SpellCheckedMetadata.class.getFields()) {
+ int mods = field.getModifiers();
+ if (Modifier.isFinal(mods) && Modifier.isPublic(mods)
+ && Modifier.isStatic(mods) && field.getType().equals(String.class)) {
+ try {
+ String val = (String) field.get(null);
+ NAMES_IDX.put(normalize(val), val);
+ } catch (Exception e) {
+ // Simply ignore...
+ }
+ }
+ }
+ normalized = NAMES_IDX.keySet().toArray(new String[NAMES_IDX.size()]);
+ }
+
+ /**
+ * Normalizes String.
+ *
+ * @param str
+ * the string to normalize
+ * @return normalized String
+ */
+ private static String normalize(final String str) {
+ char c;
+ StringBuffer buf = new StringBuffer();
+ for (int i = 0; i < str.length(); i++) {
+ c = str.charAt(i);
+ if (Character.isLetter(c)) {
+ buf.append(Character.toLowerCase(c));
+ }
+ }
+ return buf.toString();
+ }
+
+ /**
+ * Get the normalized name of metadata attribute name. This method tries to
+ * find a well-known metadata name (one of the metadata names defined in this
+ * class) that matches the specified name. The matching is error tolerant. For
+ * instance,
+ * <ul>
+ * <li>content-type gives Content-Type</li>
+ * <li>CoNtEntType gives Content-Type</li>
+ * <li>ConTnTtYpe gives Content-Type</li>
+ * </ul>
+ * If no matching with a well-known metadata name is found, then the original
+ * name is returned.
+ *
+ * @param name
+ * Name to normalize
+ * @return normalized name
+ */
+ public static String getNormalizedName(final String name) {
+ String searched = normalize(name);
+ String value = NAMES_IDX.get(searched);
+
+ if ((value == null) && (normalized != null)) {
+ int threshold = searched.length() / TRESHOLD_DIVIDER;
+ for (int i = 0; i < normalized.length && value == null; i++) {
+ if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) {
+ value = NAMES_IDX.get(normalized[i]);
+ }
+ }
+ }
+ return (value != null) ? value : name;
+ }
+
+ @Override
+ public void remove(final String name) {
+ super.remove(getNormalizedName(name));
+ }
+
+ @Override
+ public void add(final String name, final String value) {
+ String normalized = getNormalizedName(name);
+ super.add(normalized, value);
+ }
+
+ @Override
+ public String[] getValues(final String name) {
+ return super.getValues(getNormalizedName(name));
+ }
+
+ @Override
+ public String get(final String name) {
+ return super.get(getNormalizedName(name));
+ }
+
+ @Override
+ public void set(final String name, final String value) {
+ super.set(getNormalizedName(name), value);
+ }
+
+}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?view=diff&rev=474464&r1=474463&r2=474464
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Mon Nov 13 11:46:56 2006
@@ -17,17 +17,25 @@
package org.apache.nutch.protocol;
-import java.util.*;
-import java.io.*;
-
-import org.apache.hadoop.io.*;
-import org.apache.hadoop.fs.*;
-import org.apache.hadoop.conf.*;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.ArrayFile;
+import org.apache.hadoop.io.CompressedWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.VersionMismatchException;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
+import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.mime.MimeType;
-import org.apache.nutch.util.mime.MimeTypes;
import org.apache.nutch.util.mime.MimeTypeException;
-import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.mime.MimeTypes;
public final class Content extends CompressedWritable {
@@ -36,23 +44,38 @@
private final static byte VERSION = 2;
private byte version;
+
private String url;
+
private String base;
+
private byte[] content;
+
private String contentType;
+
private Metadata metadata;
+
private boolean mimeTypeMagic;
+
private MimeTypes mimeTypes;
- public Content() {}
-
+ private boolean inflated;
+
+ public Content() {
+ inflated = false;
+ }
+
public Content(String url, String base, byte[] content, String contentType,
- Metadata metadata, Configuration conf) {
+ Metadata metadata, Configuration conf) {
- if (url == null) throw new IllegalArgumentException("null url");
- if (base == null) throw new IllegalArgumentException("null base");
- if (content == null) throw new IllegalArgumentException("null content");
- if (metadata == null) throw new IllegalArgumentException("null metadata");
+ if (url == null)
+ throw new IllegalArgumentException("null url");
+ if (base == null)
+ throw new IllegalArgumentException("null base");
+ if (content == null)
+ throw new IllegalArgumentException("null content");
+ if (metadata == null)
+ throw new IllegalArgumentException("null metadata");
this.url = url;
this.base = base;
@@ -61,21 +84,30 @@
this.mimeTypeMagic = conf.getBoolean("mime.type.magic", true);
this.mimeTypes = MimeTypes.get(conf.get("mime.types.file"));
this.contentType = getContentType(contentType, url, content);
+ inflated = true;
+ }
+
+ public void ensureInflated() {
+ if (inflated) {
+ return;
+ }
+ super.ensureInflated();
+ inflated = true;
}
protected final void readFieldsCompressed(DataInput in) throws IOException {
version = in.readByte();
- metadata = new Metadata();
+ metadata = new SpellCheckedMetadata();
switch (version) {
case 0:
case 1:
- url = UTF8.readString(in); // read url
- base = UTF8.readString(in); // read base
+ url = UTF8.readString(in); // read url
+ base = UTF8.readString(in); // read base
- content = new byte[in.readInt()]; // read content
+ content = new byte[in.readInt()]; // read content
in.readFully(content);
- contentType = UTF8.readString(in); // read contentType
+ contentType = UTF8.readString(in); // read contentType
// reconstruct metadata
int keySize = in.readInt();
String key;
@@ -88,33 +120,33 @@
}
break;
case VERSION:
- url = Text.readString(in); // read url
- base = Text.readString(in); // read base
+ url = Text.readString(in); // read url
+ base = Text.readString(in); // read base
- content = new byte[in.readInt()]; // read content
+ content = new byte[in.readInt()]; // read content
in.readFully(content);
- contentType = Text.readString(in); // read contentType
- metadata.readFields(in); // read meta data
+ contentType = Text.readString(in); // read contentType
+ metadata.readFields(in); // read meta data
break;
default:
throw new VersionMismatchException(VERSION, version);
}
-
+
}
protected final void writeCompressed(DataOutput out) throws IOException {
out.writeByte(VERSION);
- Text.writeString(out, url); // write url
- Text.writeString(out, base); // write base
+ Text.writeString(out, url); // write url
+ Text.writeString(out, base); // write base
- out.writeInt(content.length); // write content
+ out.writeInt(content.length); // write content
out.write(content);
- Text.writeString(out, contentType); // write contentType
-
- metadata.write(out); // write metadata
+ Text.writeString(out, contentType); // write contentType
+
+ metadata.write(out); // write metadata
}
public static Content read(DataInput in) throws IOException {
@@ -146,6 +178,7 @@
ensureInflated();
return content;
}
+
public void setContent(byte[] content) {
ensureInflated();
this.content = content;
@@ -159,6 +192,7 @@
ensureInflated();
return contentType;
}
+
public void setContentType(String contentType) {
ensureInflated();
this.contentType = contentType;
@@ -178,30 +212,28 @@
public boolean equals(Object o) {
ensureInflated();
- if (!(o instanceof Content)){
+ if (!(o instanceof Content)) {
return false;
}
- Content that = (Content)o;
+ Content that = (Content) o;
that.ensureInflated();
- return
- this.url.equals(that.url) &&
- this.base.equals(that.base) &&
- Arrays.equals(this.getContent(), that.getContent()) &&
- this.contentType.equals(that.contentType) &&
- this.metadata.equals(that.metadata);
+ return this.url.equals(that.url) && this.base.equals(that.base)
+ && Arrays.equals(this.getContent(), that.getContent())
+ && this.contentType.equals(that.contentType)
+ && this.metadata.equals(that.metadata);
}
public String toString() {
ensureInflated();
StringBuffer buffer = new StringBuffer();
- buffer.append("Version: " + version + "\n" );
- buffer.append("url: " + url + "\n" );
- buffer.append("base: " + base + "\n" );
- buffer.append("contentType: " + contentType + "\n" );
- buffer.append("metadata: " + metadata + "\n" );
+ buffer.append("Version: " + version + "\n");
+ buffer.append("url: " + url + "\n");
+ buffer.append("base: " + base + "\n");
+ buffer.append("contentType: " + contentType + "\n");
+ buffer.append("metadata: " + metadata + "\n");
buffer.append("Content:\n");
- buffer.append(new String(content)); // try default encoding
+ buffer.append(new String(content)); // try default encoding
return buffer.toString();
@@ -210,7 +242,7 @@
public static void main(String argv[]) throws Exception {
String usage = "Content (-local | -dfs <namenode:port>) recno segment";
-
+
if (argv.length < 3) {
System.out.println("usage:" + usage);
return;
@@ -224,7 +256,8 @@
Path file = new Path(segment, DIR_NAME);
System.out.println("Reading from file: " + file);
- ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(), conf);
+ ArrayFile.Reader contents = new ArrayFile.Reader(fs, file.toString(),
+ conf);
Content content = new Content();
contents.get(recno, content);
@@ -241,10 +274,10 @@
private String getContentType(String typeName, String url, byte[] data) {
MimeType type = null;
try {
- typeName = MimeType.clean(typeName);
- type = typeName == null ? null : this.mimeTypes.forName(typeName);
+ typeName = MimeType.clean(typeName);
+ type = typeName == null ? null : this.mimeTypes.forName(typeName);
} catch (MimeTypeException mte) {
- // Seems to be a malformed mime type name...
+ // Seems to be a malformed mime type name...
}
if (typeName == null || type == null || !type.matches(url)) {
@@ -254,8 +287,8 @@
type = this.mimeTypes.getMimeType(url);
typeName = type == null ? typeName : type.getName();
}
- if (typeName == null || type == null ||
- (this.mimeTypeMagic && type.hasMagic() && !type.matches(data))) {
+ if (typeName == null || type == null
+ || (this.mimeTypeMagic && type.hasMagic() && !type.matches(data))) {
// If no mime-type already found, or the one found doesn't match
// the magic bytes it should be, then, guess a mime-type from the
// document content (magic bytes)
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java?view=diff&rev=474464&r1=474463&r2=474464
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestMetadata.java Mon Nov 13 11:46:56 2006
@@ -16,7 +16,6 @@
*/
package org.apache.nutch.metadata;
-// JDK imports
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
@@ -24,71 +23,52 @@
import java.io.IOException;
import java.util.Properties;
import junit.framework.Test;
-
-// JUnit imports
import junit.framework.TestCase;
import junit.framework.TestSuite;
import junit.textui.TestRunner;
-// Nutch imports
-import org.apache.nutch.metadata.Metadata;
-
-
/**
* JUnit based tests of class {@link org.apache.nutch.metadata.Metadata}.
- *
- * @author Chris Mattmann
- * @author Jérôme Charron
*/
public class TestMetadata extends TestCase {
-
+ private static final String CONTENTTYPE = "contenttype";
+
public TestMetadata(String testName) {
super(testName);
}
-
+
public static Test suite() {
return new TestSuite(TestMetadata.class);
}
-
+
public static void main(String[] args) {
TestRunner.run(suite());
}
-
-
- /** Test for the <code>getNormalizedName(String)</code> method. */
- public void testGetNormalizedName() {
- assertEquals("Content-Type", Metadata.getNormalizedName("Content-Type"));
- assertEquals("Content-Type", Metadata.getNormalizedName("ContentType"));
- assertEquals("Content-Type", Metadata.getNormalizedName("Content-type"));
- assertEquals("Content-Type", Metadata.getNormalizedName("contenttype"));
- assertEquals("Content-Type", Metadata.getNormalizedName("contentype"));
- assertEquals("Content-Type", Metadata.getNormalizedName("contntype"));
- }
/** Test for the <code>add(String, String)</code> method. */
public void testAdd() {
String[] values = null;
Metadata meta = new Metadata();
- values = meta.getValues("contentype");
+ values = meta.getValues(CONTENTTYPE);
assertEquals(0, values.length);
- meta.add("contentype", "value1");
- values = meta.getValues("contentype");
+ meta.add(CONTENTTYPE, "value1");
+ values = meta.getValues(CONTENTTYPE);
assertEquals(1, values.length);
assertEquals("value1", values[0]);
-
- meta.add("Content-Type", "value2");
- values = meta.getValues("contentype");
+
+ meta.add(CONTENTTYPE, "value2");
+ values = meta.getValues(CONTENTTYPE);
assertEquals(2, values.length);
assertEquals("value1", values[0]);
assertEquals("value2", values[1]);
// NOTE : For now, the same value can be added many times.
- // Should it be changed?
- meta.add("ContentType", "value1");
- values = meta.getValues("Content-Type");
+ // Should it be changed?
+ meta.add(CONTENTTYPE, "value1");
+ values = meta.getValues(CONTENTTYPE);
assertEquals(3, values.length);
assertEquals("value1", values[0]);
assertEquals("value2", values[1]);
@@ -100,43 +80,43 @@
String[] values = null;
Metadata meta = new Metadata();
- values = meta.getValues("contentype");
+ values = meta.getValues(CONTENTTYPE);
assertEquals(0, values.length);
- meta.set("contentype", "value1");
- values = meta.getValues("contentype");
+ meta.set(CONTENTTYPE, "value1");
+ values = meta.getValues(CONTENTTYPE);
assertEquals(1, values.length);
assertEquals("value1", values[0]);
-
- meta.set("Content-Type", "value2");
- values = meta.getValues("contentype");
+
+ meta.set(CONTENTTYPE, "value2");
+ values = meta.getValues(CONTENTTYPE);
assertEquals(1, values.length);
assertEquals("value2", values[0]);
-
- meta.set("contenttype", "new value 1");
+
+ meta.set(CONTENTTYPE, "new value 1");
meta.add("contenttype", "new value 2");
- values = meta.getValues("contentype");
+ values = meta.getValues(CONTENTTYPE);
assertEquals(2, values.length);
assertEquals("new value 1", values[0]);
assertEquals("new value 2", values[1]);
}
-
- /** Test for <code>setAll(Properties)</code> method */
+
+ /** Test for <code>setAll(Properties)</code> method. */
public void testSetProperties() {
String[] values = null;
Metadata meta = new Metadata();
Properties props = new Properties();
-
+
meta.setAll(props);
assertEquals(0, meta.size());
-
+
props.setProperty("name-one", "value1.1");
meta.setAll(props);
assertEquals(1, meta.size());
values = meta.getValues("name-one");
assertEquals(1, values.length);
assertEquals("value1.1", values[0]);
-
+
props.setProperty("name-two", "value2.1");
meta.setAll(props);
assertEquals(2, meta.size());
@@ -147,19 +127,18 @@
assertEquals(1, values.length);
assertEquals("value2.1", values[0]);
}
-
- /** Test for <code>get(String)</code> method */
+
+ /** Test for <code>get(String)</code> method. */
public void testGet() {
Metadata meta = new Metadata();
assertNull(meta.get("a-name"));
-
meta.add("a-name", "value-1");
assertEquals("value-1", meta.get("a-name"));
meta.add("a-name", "value-2");
assertEquals("value-1", meta.get("a-name"));
}
-
- /** Test for <code>isMultiValued()</code> method */
+
+ /** Test for <code>isMultiValued()</code> method. */
public void testIsMultiValued() {
Metadata meta = new Metadata();
assertFalse(meta.isMultiValued("key"));
@@ -169,13 +148,13 @@
assertTrue(meta.isMultiValued("key"));
}
- /** Test for <code>names</code> method */
+ /** Test for <code>names</code> method. */
public void testNames() {
String[] names = null;
Metadata meta = new Metadata();
names = meta.names();
assertEquals(0, names.length);
-
+
meta.add("name-one", "value");
names = meta.names();
assertEquals(1, names.length);
@@ -184,8 +163,8 @@
names = meta.names();
assertEquals(2, names.length);
}
-
- /** Test for <code>remove(String)</code> method */
+
+ /** Test for <code>remove(String)</code> method. */
public void testRemove() {
Metadata meta = new Metadata();
meta.remove("name-one");
@@ -206,7 +185,7 @@
assertNull(meta.get("name-two"));
}
- /** Test for <code>equals(Object)</code> method */
+ /** Test for <code>equals(Object)</code> method. */
public void testObject() {
Metadata meta1 = new Metadata();
Metadata meta2 = new Metadata();
@@ -230,8 +209,8 @@
meta2.add("name-two", "value-2.x");
assertFalse(meta1.equals(meta2));
}
-
- /** Test for <code>Writable</code> implementation */
+
+ /** Test for <code>Writable</code> implementation. */
public void testWritable() {
Metadata result = null;
Metadata meta = new Metadata();
@@ -252,17 +231,19 @@
assertEquals("value-2.1", result.getValues("name-two")[0]);
assertEquals("value-2.2", result.getValues("name-two")[1]);
}
-
+
private Metadata writeRead(Metadata meta) {
Metadata readed = new Metadata();
try {
ByteArrayOutputStream out = new ByteArrayOutputStream();
meta.write(new DataOutputStream(out));
- readed.readFields(new DataInputStream(new ByteArrayInputStream(out.toByteArray())));
+ readed.readFields(new DataInputStream(new ByteArrayInputStream(out
+ .toByteArray())));
} catch (IOException ioe) {
fail(ioe.toString());
}
return readed;
}
-
+
}
+
Added: lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java?view=auto&rev=474464
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java Mon Nov 13 11:46:56 2006
@@ -0,0 +1,267 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.metadata;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.util.Properties;
+import junit.framework.Test;
+import junit.framework.TestCase;
+import junit.framework.TestSuite;
+import junit.textui.TestRunner;
+
+/**
+ * JUnit based tests of class
+ * {@link org.apache.nutch.metadata.SpellCheckedMetadata}.
+ *
+ * @author Chris Mattmann
+ * @author Jérôme Charron
+ */
+public class TestSpellCheckedMetadata extends TestCase {
+
+ public TestSpellCheckedMetadata(String testName) {
+ super(testName);
+ }
+
+ public static Test suite() {
+ return new TestSuite(TestSpellCheckedMetadata.class);
+ }
+
+ public static void main(String[] args) {
+ TestRunner.run(suite());
+ }
+
+ /** Test for the <code>getNormalizedName(String)</code> method. */
+ public void testGetNormalizedName() {
+ assertEquals("Content-Type", SpellCheckedMetadata
+ .getNormalizedName("Content-Type"));
+ assertEquals("Content-Type", SpellCheckedMetadata
+ .getNormalizedName("ContentType"));
+ assertEquals("Content-Type", SpellCheckedMetadata
+ .getNormalizedName("Content-type"));
+ assertEquals("Content-Type", SpellCheckedMetadata
+ .getNormalizedName("contenttype"));
+ assertEquals("Content-Type", SpellCheckedMetadata
+ .getNormalizedName("contentype"));
+ assertEquals("Content-Type", SpellCheckedMetadata
+ .getNormalizedName("contntype"));
+ }
+
+ /** Test for the <code>add(String, String)</code> method. */
+ public void testAdd() {
+ String[] values = null;
+ SpellCheckedMetadata meta = new SpellCheckedMetadata();
+
+ values = meta.getValues("contentype");
+ assertEquals(0, values.length);
+
+ meta.add("contentype", "value1");
+ values = meta.getValues("contentype");
+ assertEquals(1, values.length);
+ assertEquals("value1", values[0]);
+
+ meta.add("Content-Type", "value2");
+ values = meta.getValues("contentype");
+ assertEquals(2, values.length);
+ assertEquals("value1", values[0]);
+ assertEquals("value2", values[1]);
+
+ // NOTE : For now, the same value can be added many times.
+ // Should it be changed?
+ meta.add("ContentType", "value1");
+ values = meta.getValues("Content-Type");
+ assertEquals(3, values.length);
+ assertEquals("value1", values[0]);
+ assertEquals("value2", values[1]);
+ assertEquals("value1", values[2]);
+ }
+
+ /** Test for the <code>set(String, String)</code> method. */
+ public void testSet() {
+ String[] values = null;
+ SpellCheckedMetadata meta = new SpellCheckedMetadata();
+
+ values = meta.getValues("contentype");
+ assertEquals(0, values.length);
+
+ meta.set("contentype", "value1");
+ values = meta.getValues("contentype");
+ assertEquals(1, values.length);
+ assertEquals("value1", values[0]);
+
+ meta.set("Content-Type", "value2");
+ values = meta.getValues("contentype");
+ assertEquals(1, values.length);
+ assertEquals("value2", values[0]);
+
+ meta.set("contenttype", "new value 1");
+ meta.add("contenttype", "new value 2");
+ values = meta.getValues("contentype");
+ assertEquals(2, values.length);
+ assertEquals("new value 1", values[0]);
+ assertEquals("new value 2", values[1]);
+ }
+
+ /** Test for <code>setAll(Properties)</code> method. */
+ public void testSetProperties() {
+ String[] values = null;
+ SpellCheckedMetadata meta = new SpellCheckedMetadata();
+ Properties props = new Properties();
+
+ meta.setAll(props);
+ assertEquals(0, meta.size());
+
+ props.setProperty("name-one", "value1.1");
+ meta.setAll(props);
+ assertEquals(1, meta.size());
+ values = meta.getValues("name-one");
+ assertEquals(1, values.length);
+ assertEquals("value1.1", values[0]);
+
+ props.setProperty("name-two", "value2.1");
+ meta.setAll(props);
+ assertEquals(2, meta.size());
+ values = meta.getValues("name-one");
+ assertEquals(1, values.length);
+ assertEquals("value1.1", values[0]);
+ values = meta.getValues("name-two");
+ assertEquals(1, values.length);
+ assertEquals("value2.1", values[0]);
+ }
+
+ /** Test for <code>get(String)</code> method. */
+ public void testGet() {
+ SpellCheckedMetadata meta = new SpellCheckedMetadata();
+ assertNull(meta.get("a-name"));
+
+ meta.add("a-name", "value-1");
+ assertEquals("value-1", meta.get("a-name"));
+ meta.add("a-name", "value-2");
+ assertEquals("value-1", meta.get("a-name"));
+ }
+
+ /** Test for <code>isMultiValued()</code> method. */
+ public void testIsMultiValued() {
+ SpellCheckedMetadata meta = new SpellCheckedMetadata();
+ assertFalse(meta.isMultiValued("key"));
+ meta.add("key", "value1");
+ assertFalse(meta.isMultiValued("key"));
+ meta.add("key", "value2");
+ assertTrue(meta.isMultiValued("key"));
+ }
+
+ /** Test for <code>names</code> method. */
+ public void testNames() {
+ String[] names = null;
+ SpellCheckedMetadata meta = new SpellCheckedMetadata();
+ names = meta.names();
+ assertEquals(0, names.length);
+
+ meta.add("name-one", "value");
+ names = meta.names();
+ assertEquals(1, names.length);
+ assertEquals("name-one", names[0]);
+ meta.add("name-two", "value");
+ names = meta.names();
+ assertEquals(2, names.length);
+ }
+
+ /** Test for <code>remove(String)</code> method. */
+ public void testRemove() {
+ SpellCheckedMetadata meta = new SpellCheckedMetadata();
+ meta.remove("name-one");
+ assertEquals(0, meta.size());
+ meta.add("name-one", "value-1.1");
+ meta.add("name-one", "value-1.2");
+ meta.add("name-two", "value-2.2");
+ assertEquals(2, meta.size());
+ assertNotNull(meta.get("name-one"));
+ assertNotNull(meta.get("name-two"));
+ meta.remove("name-one");
+ assertEquals(1, meta.size());
+ assertNull(meta.get("name-one"));
+ assertNotNull(meta.get("name-two"));
+ meta.remove("name-two");
+ assertEquals(0, meta.size());
+ assertNull(meta.get("name-one"));
+ assertNull(meta.get("name-two"));
+ }
+
+ /** Test for <code>equals(Object)</code> method. */
+ public void testObject() {
+ SpellCheckedMetadata meta1 = new SpellCheckedMetadata();
+ SpellCheckedMetadata meta2 = new SpellCheckedMetadata();
+ assertFalse(meta1.equals(null));
+ assertFalse(meta1.equals("String"));
+ assertTrue(meta1.equals(meta2));
+ meta1.add("name-one", "value-1.1");
+ assertFalse(meta1.equals(meta2));
+ meta2.add("name-one", "value-1.1");
+ assertTrue(meta1.equals(meta2));
+ meta1.add("name-one", "value-1.2");
+ assertFalse(meta1.equals(meta2));
+ meta2.add("name-one", "value-1.2");
+ assertTrue(meta1.equals(meta2));
+ meta1.add("name-two", "value-2.1");
+ assertFalse(meta1.equals(meta2));
+ meta2.add("name-two", "value-2.1");
+ assertTrue(meta1.equals(meta2));
+ meta1.add("name-two", "value-2.2");
+ assertFalse(meta1.equals(meta2));
+ meta2.add("name-two", "value-2.x");
+ assertFalse(meta1.equals(meta2));
+ }
+
+ /** Test for <code>Writable</code> implementation. */
+ public void testWritable() {
+ SpellCheckedMetadata result = null;
+ SpellCheckedMetadata meta = new SpellCheckedMetadata();
+ result = writeRead(meta);
+ assertEquals(0, result.size());
+ meta.add("name-one", "value-1.1");
+ result = writeRead(meta);
+ assertEquals(1, result.size());
+ assertEquals(1, result.getValues("name-one").length);
+ assertEquals("value-1.1", result.get("name-one"));
+ meta.add("name-two", "value-2.1");
+ meta.add("name-two", "value-2.2");
+ result = writeRead(meta);
+ assertEquals(2, result.size());
+ assertEquals(1, result.getValues("name-one").length);
+ assertEquals("value-1.1", result.getValues("name-one")[0]);
+ assertEquals(2, result.getValues("name-two").length);
+ assertEquals("value-2.1", result.getValues("name-two")[0]);
+ assertEquals("value-2.2", result.getValues("name-two")[1]);
+ }
+
+ private SpellCheckedMetadata writeRead(SpellCheckedMetadata meta) {
+ SpellCheckedMetadata readed = new SpellCheckedMetadata();
+ try {
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ meta.write(new DataOutputStream(out));
+ readed.readFields(new DataInputStream(new ByteArrayInputStream(out
+ .toByteArray())));
+ } catch (IOException ioe) {
+ fail(ioe.toString());
+ }
+ return readed;
+ }
+
+}
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java?view=diff&rev=474464&r1=474463&r2=474464
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/protocol/TestContent.java Mon Nov 13 11:46:56 2006
@@ -18,6 +18,7 @@
package org.apache.nutch.protocol;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.WritableTestUtils;
@@ -28,9 +29,9 @@
/** Unit tests for Content. */
public class TestContent extends TestCase {
-
+
private static Configuration conf = NutchConfiguration.create();
-
+
public TestContent(String name) { super(name); }
public void testContent() throws Exception {
@@ -39,13 +40,13 @@
String url = "http://www.foo.com/";
- Metadata metaData = new Metadata();
+ SpellCheckedMetadata metaData = new SpellCheckedMetadata();
metaData.add("Host", "www.foo.com");
metaData.add("Content-Type", "text/html");
Content r = new Content(url, url, page.getBytes("UTF8"), "text/html",
metaData, conf);
-
+
WritableTestUtils.testWritable(r);
assertEquals("text/html", r.getMetadata().get("Content-Type"));
assertEquals("text/html", r.getMetadata().get("content-type"));
@@ -105,5 +106,5 @@
null, p, conf);
assertNull(c.getContentType());
}
-
+
}