You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/11/28 22:02:12 UTC
svn commit: r480207 - in /lucene/nutch/trunk/src:
java/org/apache/nutch/metadata/ java/org/apache/nutch/protocol/
plugin/protocol-http/src/java/org/apache/nutch/protocol/http/
plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/
Author: ab
Date: Tue Nov 28 13:02:10 2006
New Revision: 480207
URL: http://svn.apache.org/viewvc?view=rev&rev=480207
Log:
Use SpellCheckedMetadata only when necessary, i.e. only when collecting
metadata from unreliable sources such as HTTP headers.
* Metadata: fix a bug where SpellCheckedMetadata would try to normalize
metadata names during (de)serialization.
* Content: use regular Metadata by default, including when de-serializing.
* HTTP protocol plugins: use SpellCheckedMetadata for response headers,
which is where it is really necessary.
Modified:
lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?view=diff&rev=480207&r1=480206&r2=480207
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Tue Nov 28 13:02:10 2006
@@ -92,6 +92,10 @@
* @return the values associated to a metadata name.
*/
public String[] getValues(final String name) {
+ return _getValues(name);
+ }
+
+ private String[] _getValues(final String name) {
String[] values = metadata.get(name);
if (values == null) {
values = new String[0];
@@ -174,8 +178,8 @@
String[] names = names();
for (int i = 0; i < names.length; i++) {
- String[] otherValues = other.getValues(names[i]);
- String[] thisValues = getValues(names[i]);
+ String[] otherValues = other._getValues(names[i]);
+ String[] thisValues = _getValues(names[i]);
if (otherValues.length != thisValues.length) {
return false;
}
@@ -192,7 +196,7 @@
StringBuffer buf = new StringBuffer();
String[] names = names();
for (int i = 0; i < names.length; i++) {
- String[] values = getValues(names[i]);
+ String[] values = _getValues(names[i]);
for (int j = 0; j < values.length; j++) {
buf.append(names[i])
.append("=")
@@ -209,7 +213,7 @@
String[] names = names();
for (int i = 0; i < names.length; i++) {
Text.writeString(out, names[i]);
- values = getValues(names[i]);
+ values = _getValues(names[i]);
int cnt = 0;
for (int j = 0; j < values.length; j++) {
if (values[j] != null)
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?view=diff&rev=480207&r1=480206&r2=480207
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Tue Nov 28 13:02:10 2006
@@ -31,7 +31,6 @@
import org.apache.hadoop.io.UTF8;
import org.apache.hadoop.io.VersionMismatchException;
import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.metadata.SpellCheckedMetadata;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.mime.MimeType;
import org.apache.nutch.util.mime.MimeTypeException;
@@ -97,7 +96,7 @@
protected final void readFieldsCompressed(DataInput in) throws IOException {
version = in.readByte();
- metadata = new SpellCheckedMetadata();
+ metadata = new Metadata();
switch (version) {
case 0:
case 1:
Modified: lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java?view=diff&rev=480207&r1=480206&r2=480207
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java Tue Nov 28 13:02:10 2006
@@ -31,6 +31,7 @@
// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.http.api.HttpBase;
@@ -47,7 +48,7 @@
private String base;
private byte[] content;
private int code;
- private Metadata headers = new Metadata();
+ private Metadata headers = new SpellCheckedMetadata();
public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java?view=diff&rev=480207&r1=480206&r2=480207
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java Tue Nov 28 13:02:10 2006
@@ -37,6 +37,7 @@
// Nutch imports
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.metadata.SpellCheckedMetadata;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.http.api.HttpBase;
import org.apache.nutch.util.LogUtil;
@@ -61,7 +62,7 @@
private int code;
- private Metadata headers = new Metadata();
+ private Metadata headers = new SpellCheckedMetadata();
public HttpResponse(HttpBase http, URL url, CrawlDatum datum) throws IOException {