You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/10/16 22:39:02 UTC
svn commit: r464654 [2/2] - in /lucene/nutch/trunk: ./ bin/ lib/
src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/
src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/
src/java/org/apache/nutch/parse/ src/java/org/ap...
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java Mon Oct 16 13:38:57 2006
@@ -32,7 +32,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
/**
* Creates and caches {@link ScoringFilter} implementing plugins.
@@ -85,7 +85,7 @@
}
/** Calculate a sort value for Generate. */
- public float generatorSortValue(UTF8 url, CrawlDatum datum, float initSort) throws ScoringFilterException {
+ public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
initSort = this.filters[i].generatorSortValue(url, datum, initSort);
}
@@ -93,46 +93,46 @@
}
/** Calculate a new initial score, used when adding newly discovered pages. */
- public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
+ public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
this.filters[i].initialScore(url, datum);
}
}
/** Calculate a new initial score, used when injecting new pages. */
- public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
+ public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
this.filters[i].injectedScore(url, datum);
}
}
/** Calculate updated page score during CrawlDb.update(). */
- public void updateDbScore(UTF8 url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException {
+ public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
this.filters[i].updateDbScore(url, old, datum, inlinked);
}
}
- public void passScoreBeforeParsing(UTF8 url, CrawlDatum datum, Content content) throws ScoringFilterException {
+ public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
this.filters[i].passScoreBeforeParsing(url, datum, content);
}
}
- public void passScoreAfterParsing(UTF8 url, Content content, Parse parse) throws ScoringFilterException {
+ public void passScoreAfterParsing(Text url, Content content, Parse parse) throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
this.filters[i].passScoreAfterParsing(url, content, parse);
}
}
- public CrawlDatum distributeScoreToOutlink(UTF8 fromUrl, UTF8 toUrl, ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount, int validCount) throws ScoringFilterException {
+ public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl, ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount, int validCount) throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
adjust = this.filters[i].distributeScoreToOutlink(fromUrl, toUrl, parseData, target, adjust, allCount, validCount);
}
return adjust;
}
- public float indexerScore(UTF8 url, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
+ public float indexerScore(Text url, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
for (int i = 0; i < this.filters.length; i++) {
initScore = this.filters[i].indexerScore(url, doc, dbDatum, fetchDatum, parse, inlinks, initScore);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/DistributedSearch.java Mon Oct 16 13:38:57 2006
@@ -76,7 +76,7 @@
static org.apache.hadoop.ipc.Server getServer(Configuration conf, Path directory, int port) throws IOException{
NutchBean bean = new NutchBean(conf, directory);
- return RPC.getServer(bean, port, 10, true, conf);
+ return RPC.getServer(bean, "0.0.0.0", port, 10, true, conf);
}
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/FetchedSegments.java Mon Oct 16 13:38:57 2006
@@ -53,7 +53,7 @@
this.conf = conf;
}
- public CrawlDatum getCrawlDatum(UTF8 url) throws IOException {
+ public CrawlDatum getCrawlDatum(Text url) throws IOException {
synchronized (this) {
if (crawl == null)
crawl = getReaders(CrawlDatum.FETCH_DIR_NAME);
@@ -61,7 +61,7 @@
return (CrawlDatum)getEntry(crawl, url, new CrawlDatum());
}
- public byte[] getContent(UTF8 url) throws IOException {
+ public byte[] getContent(Text url) throws IOException {
synchronized (this) {
if (content == null)
content = getReaders(Content.DIR_NAME);
@@ -69,7 +69,7 @@
return ((Content)getEntry(content, url, new Content())).getContent();
}
- public ParseData getParseData(UTF8 url) throws IOException {
+ public ParseData getParseData(Text url) throws IOException {
synchronized (this) {
if (parseData == null)
parseData = getReaders(ParseData.DIR_NAME);
@@ -77,7 +77,7 @@
return (ParseData)getEntry(parseData, url, new ParseData());
}
- public ParseText getParseText(UTF8 url) throws IOException {
+ public ParseText getParseText(Text url) throws IOException {
synchronized (this) {
if (parseText == null)
parseText = getReaders(ParseText.DIR_NAME);
@@ -89,7 +89,7 @@
return MapFileOutputFormat.getReaders(fs, new Path(segmentDir, subDir), this.conf);
}
- private Writable getEntry(MapFile.Reader[] readers, UTF8 url,
+ private Writable getEntry(MapFile.Reader[] readers, Text url,
Writable entry) throws IOException {
return MapFileOutputFormat.getEntry(readers, PARTITIONER, url, entry);
}
@@ -212,8 +212,8 @@
return (Segment)segments.get(details.getValue("segment"));
}
- private UTF8 getUrl(HitDetails details) {
- return new UTF8(details.getValue("url"));
+ private Text getUrl(HitDetails details) {
+ return new Text(details.getValue("url"));
}
public void close() throws IOException {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Hits.java Mon Oct 16 13:38:57 2006
@@ -22,7 +22,7 @@
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
/** A set of hits matching a query. */
public final class Hits implements Writable {
@@ -69,13 +69,13 @@
out.writeLong(total); // write total hits
out.writeInt(top.length); // write hits returned
if (top.length > 0) // write sort value class
- UTF8.writeString(out, top[0].getSortValue().getClass().getName());
+ Text.writeString(out, top[0].getSortValue().getClass().getName());
for (int i = 0; i < top.length; i++) {
Hit h = top[i];
out.writeInt(h.getIndexDocNo()); // write indexDocNo
h.getSortValue().write(out); // write sortValue
- UTF8.writeString(out, h.getDedupValue()); // write dedupValue
+ Text.writeString(out, h.getDedupValue()); // write dedupValue
}
}
@@ -85,7 +85,7 @@
Class sortClass = null;
if (top.length > 0) { // read sort value class
try {
- sortClass = Class.forName(UTF8.readString(in));
+ sortClass = Class.forName(Text.readString(in));
} catch (ClassNotFoundException e) {
throw new IOException(e.toString());
}
@@ -102,7 +102,7 @@
}
sortValue.readFields(in); // read sortValue
- String dedupValue = UTF8.readString(in); // read dedupValue
+ String dedupValue = Text.readString(in); // read dedupValue
top[i] = new Hit(indexDocNo, sortValue, dedupValue);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/IndexSearcher.java Mon Oct 16 13:38:57 2006
@@ -152,7 +152,7 @@
} else if (raw instanceof Float) {
sortValue = new FloatWritable(((Float)raw).floatValue());
} else if (raw instanceof String) {
- sortValue = new UTF8((String)raw);
+ sortValue = new Text((String)raw);
} else {
throw new RuntimeException("Unknown sort value type!");
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/LinkDbInlinks.java Mon Oct 16 13:38:57 2006
@@ -12,7 +12,7 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
@@ -30,11 +30,11 @@
}
public String[] getAnchors(HitDetails details) throws IOException {
- return linkdb.getAnchors(new UTF8(details.getValue("url")));
+ return linkdb.getAnchors(new Text(details.getValue("url")));
}
public Inlinks getInlinks(HitDetails details) throws IOException {
- return linkdb.getInlinks(new UTF8(details.getValue("url")));
+ return linkdb.getInlinks(new Text(details.getValue("url")));
}
public void close() throws IOException {
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summary.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summary.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summary.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/searcher/Summary.java Mon Oct 16 13:38:57 2006
@@ -23,7 +23,7 @@
import java.util.ArrayList;
// Hadoop imports
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
// Nutch imports
@@ -187,12 +187,12 @@
fragment = (Fragment) fragments.get(i);
if (fragment.isHighlight()) {
out.writeByte(HIGHLIGHT);
- UTF8.writeString(out, fragment.getText());
+ Text.writeString(out, fragment.getText());
} else if (fragment.isEllipsis()) {
out.writeByte(ELLIPSIS);
} else {
out.writeByte(FRAGMENT);
- UTF8.writeString(out, fragment.getText());
+ Text.writeString(out, fragment.getText());
}
}
}
@@ -204,11 +204,11 @@
for (int i=0; i<nbFragments; i++) {
int type = in.readByte();
if (type == HIGHLIGHT) {
- fragment = new Highlight(UTF8.readString(in));
+ fragment = new Highlight(Text.readString(in));
} else if (type == ELLIPSIS) {
fragment = new Ellipsis();
} else {
- fragment = new Fragment(UTF8.readString(in));
+ fragment = new Fragment(Text.readString(in));
}
fragments.add(fragment);
}
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Mon Oct 16 13:38:57 2006
@@ -89,10 +89,10 @@
public class SegmentMerger extends Configured implements Mapper, Reducer {
private static final Log LOG = LogFactory.getLog(SegmentMerger.class);
- private static final UTF8 SEGMENT_PART_KEY = new UTF8("_PaRt_");
- private static final UTF8 SEGMENT_NAME_KEY = new UTF8("_NaMe_");
+ private static final Text SEGMENT_PART_KEY = new Text("_PaRt_");
+ private static final Text SEGMENT_NAME_KEY = new Text("_NaMe_");
private static final String nameMarker = SEGMENT_NAME_KEY.toString();
- private static final UTF8 SEGMENT_SLICE_KEY = new UTF8("_SlIcE_");
+ private static final Text SEGMENT_SLICE_KEY = new Text("_SlIcE_");
private static final String sliceMarker = SEGMENT_SLICE_KEY.toString();
private URLFilters filters = null;
@@ -140,8 +140,8 @@
Object o = wrapper.get();
if (o instanceof CrawlDatum) {
// record which part of segment this comes from
- ((CrawlDatum)o).getMetaData().put(SEGMENT_PART_KEY, new UTF8(part));
- ((CrawlDatum)o).getMetaData().put(SEGMENT_NAME_KEY, new UTF8(segment));
+ ((CrawlDatum)o).getMetaData().put(SEGMENT_PART_KEY, new Text(part));
+ ((CrawlDatum)o).getMetaData().put(SEGMENT_NAME_KEY, new Text(segment));
} else if (o instanceof Content) {
if (((Content)o).getMetadata() == null) {
((Content)o).setMetadata(new Metadata());
@@ -162,6 +162,10 @@
}
return res;
}
+
+ public Writable createValue() {
+ return new ObjectWritable();
+ }
};
}
}
@@ -186,12 +190,12 @@
String slice = null;
if (o instanceof CrawlDatum) {
// check which output dir it should go into
- UTF8 part = (UTF8)((CrawlDatum)o).getMetaData().get(SEGMENT_PART_KEY);
+ Text part = (Text)((CrawlDatum)o).getMetaData().get(SEGMENT_PART_KEY);
((CrawlDatum)o).getMetaData().remove(SEGMENT_PART_KEY);
((CrawlDatum)o).getMetaData().remove(SEGMENT_NAME_KEY);
if (part == null)
throw new IOException("Null segment part, key=" + key);
- UTF8 uSlice = (UTF8)((CrawlDatum)o).getMetaData().get(SEGMENT_SLICE_KEY);
+ Text uSlice = (Text)((CrawlDatum)o).getMetaData().get(SEGMENT_SLICE_KEY);
((CrawlDatum)o).getMetaData().remove(SEGMENT_SLICE_KEY);
if (uSlice != null) slice = uSlice.toString();
String partString = part.toString();
@@ -267,7 +271,7 @@
} else {
wname = new Path(new Path(new Path(job.getOutputPath(), segmentName + "-" + slice), dirName), name);
}
- res = new SequenceFile.Writer(fs, wname, UTF8.class, CrawlDatum.class);
+ res = new SequenceFile.Writer(fs, job, wname, Text.class, CrawlDatum.class);
sliceWriters.put(slice + dirName, res);
return res;
}
@@ -283,7 +287,7 @@
} else {
wname = new Path(new Path(new Path(job.getOutputPath(), segmentName + "-" + slice), dirName), name);
}
- res = new MapFile.Writer(fs, wname.toString(), UTF8.class, clazz);
+ res = new MapFile.Writer(fs, wname.toString(), Text.class, clazz);
sliceWriters.put(slice + dirName, res);
return res;
}
@@ -332,10 +336,17 @@
}
}
+ private Text newKey = new Text();
+
public void map(WritableComparable key, Writable value, OutputCollector output, Reporter reporter) throws IOException {
+ // convert on the fly from the old format
+ if (key instanceof UTF8) {
+ newKey.set(key.toString());
+ key = newKey;
+ }
if (filters != null) {
try {
- if (filters.filter(((UTF8)key).toString()) == null) {
+ if (filters.filter(((Text)key).toString()) == null) {
return;
}
} catch (Exception e) {
@@ -373,10 +384,10 @@
if (o instanceof CrawlDatum) {
CrawlDatum val = (CrawlDatum)o;
// check which output dir it belongs to
- UTF8 part = (UTF8)val.getMetaData().get(SEGMENT_PART_KEY);
+ Text part = (Text)val.getMetaData().get(SEGMENT_PART_KEY);
if (part == null)
throw new IOException("Null segment part, key=" + key);
- UTF8 uName = (UTF8)val.getMetaData().get(SEGMENT_NAME_KEY);
+ Text uName = (Text)val.getMetaData().get(SEGMENT_NAME_KEY);
if (uName == null)
throw new IOException("Null segment name, key=" + key);
String name = uName.toString();
@@ -470,10 +481,10 @@
}
}
curCount++;
- UTF8 sliceName = null;
+ Text sliceName = null;
ObjectWritable wrapper = new ObjectWritable();
if (sliceSize > 0) {
- sliceName = new UTF8(String.valueOf(curCount / sliceSize));
+ sliceName = new Text(String.valueOf(curCount / sliceSize));
}
// now output the latest values
if (lastG != null) {
@@ -613,12 +624,10 @@
}
}
job.setInputFormat(ObjectInputFormat.class);
- job.setInputKeyClass(UTF8.class);
- job.setInputValueClass(ObjectWritable.class);
job.setMapperClass(SegmentMerger.class);
job.setReducerClass(SegmentMerger.class);
job.setOutputPath(out);
- job.setOutputKeyClass(UTF8.class);
+ job.setOutputKeyClass(Text.class);
job.setOutputValueClass(ObjectWritable.class);
job.setOutputFormat(SegmentOutputFormat.class);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Mon Oct 16 13:38:57 2006
@@ -65,9 +65,27 @@
}
return super.next(key, (Writable) wrapper.get());
}
+
+ public Writable createValue() {
+ return new ObjectWritable();
+ }
};
}
}
+
+ public static class InputCompatMapper extends MapReduceBase implements Mapper {
+ private Text newKey = new Text();
+
+ public void map(WritableComparable key, Writable value, OutputCollector collector, Reporter reporter) throws IOException {
+ // convert on the fly from old formats with UTF8 keys
+ if (key instanceof UTF8) {
+ newKey.set(key.toString());
+ key = newKey;
+ }
+ collector.collect(key, value);
+ }
+
+ }
/** Implements a text output format */
public static class TextOutputFormat extends org.apache.hadoop.mapred.OutputFormatBase {
@@ -180,9 +198,7 @@
if (pt) job.addInputPath(new Path(segment, ParseText.DIR_NAME));
job.setInputFormat(InputFormat.class);
- job.setInputKeyClass(UTF8.class);
- job.setInputValueClass(ObjectWritable.class);
-
+ job.setMapperClass(InputCompatMapper.class);
job.setReducerClass(SegmentReader.class);
Path tempDir = new Path("/tmp/segread-" + new java.util.Random().nextInt());
@@ -190,7 +206,7 @@
job.setOutputPath(tempDir);
job.setOutputFormat(TextOutputFormat.class);
- job.setOutputKeyClass(UTF8.class);
+ job.setOutputKeyClass(Text.class);
job.setOutputValueClass(ObjectWritable.class);
JobClient.runJob(job);
@@ -255,7 +271,7 @@
{"pt", "ParseText::\n"}
};
- public void get(final Path segment, final UTF8 key, Writer writer,
+ public void get(final Path segment, final Text key, Writer writer,
final Map results) throws Exception {
if (LOG.isInfoEnabled()) { LOG.info("SegmentReader: get '" + key + "'"); }
ArrayList threads = new ArrayList();
@@ -346,12 +362,12 @@
}
}
- private List getMapRecords(Path dir, UTF8 key) throws Exception {
+ private List getMapRecords(Path dir, Text key) throws Exception {
MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, dir, getConf());
ArrayList res = new ArrayList();
Class keyClass = readers[0].getKeyClass();
Class valueClass = readers[0].getValueClass();
- if (!keyClass.getName().equals("org.apache.hadoop.io.UTF8"))
+ if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
throw new IOException("Incompatible key (" + keyClass.getName() + ")");
Writable value = (Writable)valueClass.newInstance();
// we don't know the partitioning schema
@@ -363,12 +379,12 @@
return res;
}
- private List getSeqRecords(Path dir, UTF8 key) throws Exception {
+ private List getSeqRecords(Path dir, Text key) throws Exception {
SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(), dir);
ArrayList res = new ArrayList();
Class keyClass = readers[0].getKeyClass();
Class valueClass = readers[0].getValueClass();
- if (!keyClass.getName().equals("org.apache.hadoop.io.UTF8"))
+ if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))
throw new IOException("Incompatible key (" + keyClass.getName() + ")");
Writable aKey = (Writable)keyClass.newInstance();
Writable value = (Writable)valueClass.newInstance();
@@ -423,7 +439,7 @@
public void getStats(Path segment, final SegmentReaderStats stats) throws Exception {
SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(), new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
long cnt = 0L;
- UTF8 key = new UTF8();
+ Text key = new Text();
for (int i = 0; i < readers.length; i++) {
while (readers[i].next(key)) cnt++;
readers[i].close();
@@ -566,7 +582,7 @@
usage();
return;
}
- segmentReader.get(new Path(input), new UTF8(key), new OutputStreamWriter(System.out, "UTF-8"), new HashMap());
+ segmentReader.get(new Path(input), new Text(key), new OutputStreamWriter(System.out, "UTF-8"), new HashMap());
return;
default:
System.err.println("Invalid operation: " + args[0]);
Modified: lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/DmozParser.java Mon Oct 16 13:38:57 2006
@@ -291,8 +291,8 @@
if (LOG.isInfoEnabled()) { LOG.info("skew = " + rp.hashSkew); }
//
- // Open filtered text stream. The UTF8Filter makes sure that
- // only appropriate XML-approved UTF8 characters are received.
+    // Open filtered text stream. The XMLCharFilter makes sure that
+    // only appropriate XML-approved UTF-8 characters are received.
// Any non-conforming characters are silently skipped.
//
XMLCharFilter in = new XMLCharFilter(new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(dmozFile)), "UTF-8")));
Added: lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java?view=auto&rev=464654
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java Mon Oct 16 13:38:57 2006
@@ -0,0 +1,141 @@
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tools.compat;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Random;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapFileOutputFormat;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.mapred.SequenceFileInputFormat;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.ToolBase;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.CrawlDb;
+import org.apache.nutch.crawl.MapWritable;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.NutchJob;
+
+/**
+ * This tool converts CrawlDb created in old <UTF8, CrawlDatum> format
+ * (Nutch versions < 0.9.0) to the new <Text, CrawlDatum> format.
+ * Optionally {@link org.apache.nutch.crawl.CrawlDatum#metaData} can be converted
+ * too from using UTF8 keys to using Text keys.
+ *
+ * @author Andrzej Bialecki
+ */
+public class CrawlDbConverter extends ToolBase implements Mapper {
+ private static final Log LOG = LogFactory.getLog(CrawlDbConverter.class);
+
+ private static final String CONVERT_META_KEY = "db.converter.with.metadata";
+
+ private boolean withMetadata;
+ private Text newKey;
+
+ public void configure(JobConf job) {
+ setConf(job);
+ withMetadata = job.getBoolean(CONVERT_META_KEY, false);
+ newKey = new Text();
+ }
+
+ public void map(WritableComparable key, Writable value, OutputCollector output,
+ Reporter reporter) throws IOException {
+ newKey.set(key.toString());
+ if (withMetadata) {
+ CrawlDatum datum = (CrawlDatum)value;
+ MapWritable meta = datum.getMetaData();
+ if (meta.size() > 0) {
+ MapWritable newMeta = new MapWritable();
+ Iterator it = meta.keySet().iterator();
+ while (it.hasNext()) {
+ WritableComparable k = (WritableComparable)it.next();
+ Writable v = meta.get(k);
+ if (k instanceof UTF8) {
+ Text t = new Text(k.toString());
+ k = t;
+ }
+ newMeta.put(k, v);
+ }
+ datum.setMetaData(newMeta);
+ }
+ }
+ output.collect(newKey, value);
+ }
+
+ public void close() throws IOException {
+ }
+
+ /**
+ * @param args
+ */
+ public static void main(String[] args) throws Exception {
+ int res = new CrawlDbConverter().doMain(NutchConfiguration.create(), args);
+ }
+
+ public int run(String[] args) throws Exception {
+ if (args.length == 0) {
+ System.err.println("Usage: CrawlDbConverter <oldDb> <newDb> [-withMetadata]");
+ System.err.println("\toldDb\tname of the crawldb that uses UTF8 class.");
+ System.err.println("\tnewDb\tname of the crawldb that will use Text class.");
+ System.err.println("\twithMetadata\tconvert also all metadata keys using UTF8 to Text.");
+ return -1;
+ }
+ JobConf job = new NutchJob(getConf());
+ FileSystem fs = FileSystem.get(getConf());
+ Path oldDb = new Path(args[0], CrawlDatum.DB_DIR_NAME);
+ Path newDb =
+ new Path(oldDb,
+ Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
+ if (!fs.exists(oldDb)) {
+ LOG.fatal("Old db doesn't exist in '" + args[0] + "'");
+ return -1;
+ }
+ boolean withMetadata = false;
+ if (args.length > 2 && args[2].equalsIgnoreCase("-withMetadata"))
+ withMetadata = true;
+
+ job.setBoolean(CONVERT_META_KEY, withMetadata);
+ job.setInputPath(oldDb);
+ job.setInputFormat(SequenceFileInputFormat.class);
+ job.setMapperClass(CrawlDbConverter.class);
+ job.setOutputFormat(MapFileOutputFormat.class);
+ job.setOutputKeyClass(Text.class);
+ job.setOutputValueClass(CrawlDatum.class);
+ job.setOutputPath(newDb);
+ try {
+ JobClient.runJob(job);
+ CrawlDb.install(job, new Path(args[1]));
+ return 0;
+ } catch (Exception e) {
+ LOG.fatal("Error: " + StringUtils.stringifyException(e));
+ return -1;
+ }
+ }
+}
Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/tools/compat/CrawlDbConverter.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java Mon Oct 16 13:38:57 2006
@@ -24,7 +24,7 @@
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
@@ -49,7 +49,7 @@
private Configuration conf;
- public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+ public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
Metadata metadata = parse.getData().getParseMeta();
Modified: lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Mon Oct 16 13:38:57 2006
@@ -19,6 +19,8 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
+import org.apache.lucene.document.DateField;
+import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -26,7 +28,7 @@
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
@@ -43,7 +45,7 @@
private int MAX_TITLE_LENGTH;
private Configuration conf;
- public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+ public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
String host = null;
@@ -87,6 +89,11 @@
}
// add title indexed and stored so that it can be displayed
doc.add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED));
+
+ // add timestamp when fetched, for deduplication
+ doc.add(new Field("tstamp",
+ DateTools.timeToString(datum.getFetchTime(), DateTools.Resolution.MILLISECOND),
+ Field.Store.YES, Field.Index.NO));
return doc;
}
Modified: lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Mon Oct 16 13:38:57 2006
@@ -48,7 +48,7 @@
import org.apache.nutch.util.mime.MimeTypeException;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import java.text.ParseException;
import java.text.SimpleDateFormat;
@@ -81,7 +81,7 @@
/** Get the MimeTypes resolver instance. */
private MimeTypes MIME;
- public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+ public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
String url_s = url.toString();
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIdentifier.java Mon Oct 16 13:38:57 2006
@@ -37,7 +37,7 @@
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
// Nutch imports
import org.apache.nutch.analysis.lang.NGramProfile.NGramEntry;
@@ -344,7 +344,7 @@
Protocol protocol;
try {
protocol = new ProtocolFactory(conf).getProtocol(url);
- Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();
+ Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
Parse parse = new ParseUtil(conf).parse(content);
System.out.println("text:" + parse.getText());
return parse.getText();
Modified: lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/languageidentifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java Mon Oct 16 13:38:57 2006
@@ -21,7 +21,7 @@
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.Response;
@@ -64,7 +64,7 @@
}
// Inherited JavaDoc
- public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+ public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
// check if LANGUAGE found, possibly put there by HTMLLanguageParser
Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Mon Oct 16 13:38:57 2006
@@ -40,7 +40,7 @@
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
/**
@@ -170,7 +170,7 @@
- public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
String urlString = url.toString();
try {
@@ -509,7 +509,7 @@
// LOGGER.setLevel(Level.FINE);
// }
- ProtocolOutput out = http.getProtocolOutput(new UTF8(url), new CrawlDatum());
+ ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum());
Content content = out.getContent();
System.out.println("Status: " + out.getStatus());
Added: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.0.0.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.0.0.jar?view=auto&rev=464654
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/lib/lucene-analyzers-2.0.0.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/lib-lucene-analyzers/plugin.xml Mon Oct 16 13:38:57 2006
@@ -10,11 +10,11 @@
<plugin
id="lib-lucene-analyzers"
name="Lucene Analysers"
- version="1.9-rc1-dev"
+ version="2.0.0"
provider-name="org.apache.lucene">
<runtime>
- <library name="lucene-analyzers-1.9-rc1-dev.jar">
+ <library name="lucene-analyzers-2.0.0.jar">
<export name="*"/>
</library>
</runtime>
Modified: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java Mon Oct 16 13:38:57 2006
@@ -21,7 +21,7 @@
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.parse.Parse;
// Hadoop imports
@@ -47,7 +47,7 @@
// Inherited JavaDoc
- public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks)
+ public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks)
throws IndexingException {
// Check if some Rel-Tags found, possibly put there by RelTagParser
Modified: lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Mon Oct 16 13:38:57 2006
@@ -27,7 +27,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import junit.framework.TestCase;
@@ -82,7 +82,7 @@
// get nutch content
Protocol protocol = new ProtocolFactory(NutchConfiguration.create()).getProtocol(urlString);
- content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+ content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
protocol = null;
}
Modified: lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mp3/src/test/org/apache/nutch/parse/mp3/TestMP3Parser.java Mon Oct 16 13:38:57 2006
@@ -17,7 +17,7 @@
package org.apache.nutch.parse.mp3;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
@@ -68,7 +68,7 @@
Configuration conf = NutchConfiguration.create();
urlString = "file:" + sampleDir + fileSeparator + id3v2;
protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+ content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
.getContent();
parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);
Metadata metadata = parse.getData().getParseMeta();
@@ -100,7 +100,7 @@
Configuration conf = NutchConfiguration.create();
urlString = "file:" + sampleDir + fileSeparator + id3v1;
protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+ content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
.getContent();
parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);
@@ -127,7 +127,7 @@
Configuration conf = NutchConfiguration.create();
urlString = "file:" + sampleDir + fileSeparator + none;
protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+ content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
.getContent();
parse = new ParseUtil(conf).parseByExtensionId("parse-mp3", content);
// Metadata metadata = parse.getData().getParseMeta();
Modified: lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msexcel/src/test/org/apache/nutch/parse/msexcel/TestMSExcelParser.java Mon Oct 16 13:38:57 2006
@@ -20,7 +20,7 @@
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
/**
@@ -61,7 +61,7 @@
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
protocol = factory.getProtocol(urlString);
- content = protocol.getProtocolOutput(new UTF8(urlString),
+ content = protocol.getProtocolOutput(new Text(urlString),
new CrawlDatum()).getContent();
parse = parser.parseByExtensionId("parse-msexcel", content);
Modified: lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-mspowerpoint/src/test/org/apache/nutch/parse/mspowerpoint/TestMSPowerPointParser.java Mon Oct 16 13:38:57 2006
@@ -37,7 +37,7 @@
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
/**
@@ -107,7 +107,7 @@
System.out.println("Testing file: " + this.urlString + "...");
this.protocol =new ProtocolFactory(NutchConfiguration.create()).getProtocol(this.urlString);
- this.content = this.protocol.getProtocolOutput(new UTF8(this.urlString), new CrawlDatum()).getContent();
+ this.content = this.protocol.getProtocolOutput(new Text(this.urlString), new CrawlDatum()).getContent();
}
/**
Modified: lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-msword/src/test/org/apache/nutch/parse/msword/TestMSWordParser.java Mon Oct 16 13:38:57 2006
@@ -27,7 +27,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import junit.framework.TestCase;
@@ -68,7 +68,7 @@
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+ content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
parse = new ParseUtil(conf).parseByExtensionId("parse-msword", content);
assertTrue(parse.getText().startsWith(expectedText));
Modified: lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-oo/src/test/org/apache/nutch/parse/oo/TestOOParser.java Mon Oct 16 13:38:57 2006
@@ -21,7 +21,7 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.protocol.*;
import org.apache.nutch.parse.Parse;
@@ -87,7 +87,7 @@
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
protocol = factory.getProtocol(urlString);
- content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+ content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
parse = parser.getParse(content);
Modified: lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-pdf/src/test/org/apache/nutch/parse/pdf/TestPdfParser.java Mon Oct 16 13:38:57 2006
@@ -27,7 +27,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import junit.framework.TestCase;
@@ -68,7 +68,7 @@
Configuration conf = NutchConfiguration.create();
protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+ content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
parse = new ParseUtil(conf).parseByExtensionId("parse-pdf", content);
int index = parse.getText().indexOf(expectedText);
Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/java/org/apache/nutch/parse/rss/RSSParser.java Mon Oct 16 13:38:57 2006
@@ -27,7 +27,7 @@
import org.apache.commons.logging.LogFactory;
// Hadoop imports
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.conf.Configuration;
// Nutch imports
@@ -216,7 +216,7 @@
RSSParser parser = new RSSParser();
parser.setConf(conf);
Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
- Content content = protocol.getProtocolOutput(new UTF8(url), new CrawlDatum()).getContent();
+ Content content = protocol.getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
Parse parse = parser.getParse(content);
System.out.println("data: "+ parse.getData());
System.out.println("text: "+parse.getText());
Modified: lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/src/test/org/apache/nutch/parse/rss/TestRSSParser.java Mon Oct 16 13:38:57 2006
@@ -29,7 +29,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import junit.framework.TestCase;
@@ -86,7 +86,7 @@
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+ content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
parse = new ParseUtil(conf).parseByExtensionId("parse-rss", content);
//check that there are 3 outlinks:
Modified: lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-rtf/src/test/org/apache/nutch/parse/rtf/TestRTFParser.java Mon Oct 16 13:38:57 2006
@@ -34,7 +34,7 @@
// Hadoop imports
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
/**
@@ -72,7 +72,7 @@
Configuration conf = NutchConfiguration.create();
urlString = "file:" + sampleDir + fileSeparator + rtfFile;
protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum())
+ content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
.getContent();
parse = new ParseUtil(conf).parseByExtensionId("parse-rtf", content);
String text = parse.getText();
Modified: lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java Mon Oct 16 13:38:57 2006
@@ -20,7 +20,7 @@
import java.io.InputStreamReader;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.Content;
@@ -85,7 +85,7 @@
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+ content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
parse = new ParseUtil(conf).parse(content);
Modified: lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (original)
+++ lucene/nutch/trunk/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java Mon Oct 16 13:38:57 2006
@@ -27,7 +27,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import junit.framework.TestCase;
@@ -68,7 +68,7 @@
urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+ content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
parse = new ParseUtil(conf).parseByExtensionId("parse-zip",content);
assertTrue(parse.getText().equals(expectedText));
}
Modified: lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java Mon Oct 16 13:38:57 2006
@@ -20,7 +20,7 @@
import org.apache.commons.logging.LogFactory;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
@@ -67,7 +67,7 @@
/** Set the point at which content is truncated. */
public void setMaxContentLength(int length) {maxContentLength = length;}
- public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
String urlString = url.toString();
try {
URL u = new URL(urlString);
@@ -141,7 +141,7 @@
// set log level
//LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
- Content content = file.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+ Content content = file.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
System.err.println("Content-Type: " + content.getContentType());
System.err.println("Content-Length: " +
Modified: lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (original)
+++ lucene/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Mon Oct 16 13:38:57 2006
@@ -22,7 +22,7 @@
import org.apache.commons.net.ftp.FTPFileEntryParser;
import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
@@ -111,7 +111,7 @@
this.keepConnection = keepConnection;
}
- public ProtocolOutput getProtocolOutput(UTF8 url, CrawlDatum datum) {
+ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
String urlString = url.toString();
try {
URL u = new URL(urlString);
@@ -207,7 +207,7 @@
// set log level
//LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
- Content content = ftp.getProtocolOutput(new UTF8(urlString), new CrawlDatum()).getContent();
+ Content content = ftp.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
System.err.println("Content-Type: " + content.getContentType());
System.err.println("Content-Length: " +
Modified: lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Mon Oct 16 13:38:57 2006
@@ -25,7 +25,7 @@
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.lucene.document.Document;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
@@ -73,23 +73,23 @@
}
/** Set to the value defined in config, 1.0f by default. */
- public void injectedScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
+ public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException {
datum.setScore(scoreInjected);
}
/** Set to 0.0f (unknown value) - inlink contributions will bring it to
* a correct level. Newly discovered pages have at least one inlink. */
- public void initialScore(UTF8 url, CrawlDatum datum) throws ScoringFilterException {
+ public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException {
datum.setScore(0.0f);
}
/** Use {@link CrawlDatum#getScore()}. */
- public float generatorSortValue(UTF8 url, CrawlDatum datum, float initSort) throws ScoringFilterException {
+ public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException {
return datum.getScore();
}
/** Increase the score by a sum of inlinked scores. */
- public void updateDbScore(UTF8 url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException {
+ public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List inlinked) throws ScoringFilterException {
float adjust = 0.0f;
for (int i = 0; i < inlinked.size(); i++) {
CrawlDatum linked = (CrawlDatum)inlinked.get(i);
@@ -100,17 +100,17 @@
}
/** Store a float value of CrawlDatum.getScore() under Fetcher.SCORE_KEY. */
- public void passScoreBeforeParsing(UTF8 url, CrawlDatum datum, Content content) {
+ public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
content.getMetadata().set(Fetcher.SCORE_KEY, "" + datum.getScore());
}
/** Copy the value from Content metadata under Fetcher.SCORE_KEY to parseData. */
- public void passScoreAfterParsing(UTF8 url, Content content, Parse parse) {
+ public void passScoreAfterParsing(Text url, Content content, Parse parse) {
parse.getData().getContentMeta().set(Fetcher.SCORE_KEY, content.getMetadata().get(Fetcher.SCORE_KEY));
}
/** Get a float value from Fetcher.SCORE_KEY, divide it by the number of outlinks and apply. */
- public CrawlDatum distributeScoreToOutlink(UTF8 fromUrl, UTF8 toUrl, ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount, int validCount) throws ScoringFilterException {
+ public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl, ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount, int validCount) throws ScoringFilterException {
float score = scoreInjected;
String scoreString = parseData.getContentMeta().get(Fetcher.SCORE_KEY);
if (scoreString != null) {
@@ -146,7 +146,7 @@
}
/** Dampen the boost value by scorePower.*/
- public float indexerScore(UTF8 url, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
+ public float indexerScore(Text url, Document doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
return (float)Math.pow(dbDatum.getScore(), scorePower);
}
}
Modified: lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java Mon Oct 16 13:38:57 2006
@@ -17,7 +17,7 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
@@ -66,7 +66,7 @@
doc.add(new Field(FIELD_NAME, collname, Field.Store.YES, Field.Index.TOKENIZED));
}
- public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+ public Document filter(Document doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
String sUrl = url.toString();
addSubCollectionField(doc, sUrl);
return doc;
Added: lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0.0.jar
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0.0.jar?view=auto&rev=464654
==============================================================================
Binary file - no diff available.
Propchange: lucene/nutch/trunk/src/plugin/summary-lucene/lib/lucene-highlighter-2.0.0.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/summary-lucene/plugin.xml Mon Oct 16 13:38:57 2006
@@ -10,7 +10,7 @@
<library name="summary-lucene.jar">
<export name="*"/>
</library>
- <library name="lucene-highlighter-2.0-rc1-dev.jar"/>
+ <library name="lucene-highlighter-2.0.0.jar"/>
</runtime>
<requires>
Modified: lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java (original)
+++ lucene/nutch/trunk/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java Mon Oct 16 13:38:57 2006
@@ -158,6 +158,7 @@
}
if (curRules == EMPTY_RULES || curRules == null) {
LOG.warn("can't find rules for scope '" + scope + "', using default");
+ scopedRules.put(scope, EMPTY_RULES);
}
}
if (curRules == EMPTY_RULES || curRules == null) {
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java Mon Oct 16 13:38:57 2006
@@ -28,7 +28,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.mortbay.http.HttpContext;
import org.mortbay.http.SocketListener;
import org.mortbay.http.handler.ResourceHandler;
@@ -54,12 +54,12 @@
LOG.trace("* creating crawldb: " + crawldb);
Path dir = new Path(crawldb, CrawlDatum.DB_DIR_NAME);
MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000")
- .toString(), UTF8.class, CrawlDatum.class);
+ .toString(), Text.class, CrawlDatum.class);
Iterator<URLCrawlDatum> it = init.iterator();
while (it.hasNext()) {
URLCrawlDatum row = it.next();
LOG.info("adding:" + row.url.toString());
- writer.append(new UTF8(row.url), row.datum);
+ writer.append(new Text(row.url), row.datum);
}
writer.close();
}
@@ -92,11 +92,11 @@
public static class URLCrawlDatum {
- UTF8 url;
+ Text url;
CrawlDatum datum;
- public URLCrawlDatum(UTF8 url, CrawlDatum datum) {
+ public URLCrawlDatum(Text url, CrawlDatum datum) {
this.url = url;
this.datum = datum;
}
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java Mon Oct 16 13:38:57 2006
@@ -25,7 +25,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.util.NutchConfiguration;
import junit.framework.TestCase;
@@ -61,12 +61,12 @@
cd1 = new CrawlDatum();
cd1.setFetchInterval(1.0f);
cd1.setFetchTime(time);
- cd1.getMetaData().put(new UTF8("name"), new UTF8("cd1"));
- cd1.getMetaData().put(new UTF8("cd1"), new UTF8("cd1"));
+ cd1.getMetaData().put(new Text("name"), new Text("cd1"));
+ cd1.getMetaData().put(new Text("cd1"), new Text("cd1"));
cd2 = new CrawlDatum();
cd2.setFetchInterval(1.0f);
cd2.setFetchTime(time + 10000);
- cd2.getMetaData().put(new UTF8("name"), new UTF8("cd2"));
+ cd2.getMetaData().put(new Text("name"), new Text("cd2"));
cd3 = new CrawlDatum();
cd3.setFetchInterval(1.0f);
cd3.setFetchTime(time + 10000);
@@ -125,11 +125,11 @@
private void createCrawlDb(FileSystem fs, Path crawldb, TreeSet init, CrawlDatum cd) throws Exception {
LOG.fine("* creating crawldb: " + crawldb);
Path dir = new Path(crawldb, CrawlDatum.DB_DIR_NAME);
- MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000").toString(), UTF8.class, CrawlDatum.class);
+ MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000").toString(), Text.class, CrawlDatum.class);
Iterator it = init.iterator();
while (it.hasNext()) {
String key = (String)it.next();
- writer.append(new UTF8(key), cd);
+ writer.append(new Text(key), cd);
}
writer.close();
}
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestGenerator.java Mon Oct 16 13:38:57 2006
@@ -24,7 +24,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDBTestUtil.URLCrawlDatum;
import junit.framework.TestCase;
@@ -80,7 +80,7 @@
ArrayList<URLCrawlDatum> list = new ArrayList<URLCrawlDatum>();
for(int i=0;i<=100;i++){
- list.add(new CrawlDBTestUtil.URLCrawlDatum(new UTF8("http://aaa/" + pad(i)),
+ list.add(new CrawlDBTestUtil.URLCrawlDatum(new Text("http://aaa/" + pad(i)),
new CrawlDatum(CrawlDatum.STATUS_DB_UNFETCHED, 1, i)));
}
@@ -105,7 +105,7 @@
READ:
do {
- UTF8 key=new UTF8();
+ Text key=new Text();
CrawlDatum value=new CrawlDatum();
if(!reader.next(key, value)) break READ;
l.add(new URLCrawlDatum(key, value));
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestInjector.java Mon Oct 16 13:38:57 2006
@@ -24,7 +24,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import junit.framework.TestCase;
@@ -111,7 +111,7 @@
READ:
do {
- UTF8 key=new UTF8();
+ Text key=new Text();
CrawlDatum value=new CrawlDatum();
if(!reader.next(key, value)) break READ;
read.add(key.toString());
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java Mon Oct 16 13:38:57 2006
@@ -26,7 +26,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.util.NutchConfiguration;
import junit.framework.TestCase;
@@ -122,7 +122,7 @@
String url = (String)it.next();
LOG.fine("url=" + url);
String[] vals = (String[])expected.get(url);
- Inlinks inlinks = reader.getInlinks(new UTF8(url));
+ Inlinks inlinks = reader.getInlinks(new Text(url));
// may not be null
assertNotNull(inlinks);
ArrayList links = new ArrayList();
@@ -143,7 +143,7 @@
private void createLinkDb(FileSystem fs, Path linkdb, TreeMap init) throws Exception {
LOG.fine("* creating linkdb: " + linkdb);
Path dir = new Path(linkdb, LinkDb.CURRENT_NAME);
- MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000").toString(), UTF8.class, Inlinks.class);
+ MapFile.Writer writer = new MapFile.Writer(fs, new Path(dir, "part-00000").toString(), Text.class, Inlinks.class);
Iterator it = init.keySet().iterator();
while (it.hasNext()) {
String key = (String)it.next();
@@ -153,7 +153,7 @@
Inlink in = new Inlink(vals[i], vals[i]);
inlinks.add(in);
}
- writer.append(new UTF8(key), inlinks);
+ writer.append(new Text(key), inlinks);
}
writer.close();
}
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/crawl/TestMapWritable.java Mon Oct 16 13:38:57 2006
@@ -25,7 +25,7 @@
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.MapWritable;
@@ -39,11 +39,11 @@
MapWritable map = new MapWritable();
assertTrue(map.isEmpty());
for (int i = 0; i < 100; i++) {
- UTF8 key = new UTF8("" + i);
+ Text key = new Text("" + i);
IntWritable value = new IntWritable(i);
map.put(key, value);
assertEquals(i + 1, map.size());
- assertTrue(map.containsKey(new UTF8("" + i)));
+ assertTrue(map.containsKey(new Text("" + i)));
assertTrue(map.containsValue(new IntWritable(i)));
map.remove(key);
assertEquals(i, map.size());
@@ -64,14 +64,14 @@
map.clear();
assertTrue(map.isEmpty());
assertEquals(0, map.size());
- assertFalse(map.containsKey(new UTF8("" + 1)));
+ assertFalse(map.containsKey(new Text("" + 1)));
}
public void testWritable() throws Exception {
MapWritable datum1 = new MapWritable();
for (int i = 0; i < 100; i++) {
- datum1.put(new LongWritable(i), new UTF8("" + 1));
+ datum1.put(new LongWritable(i), new Text("" + 1));
}
assertEquals(100, datum1.size());
testWritable(datum1);
@@ -86,7 +86,7 @@
CrawlDatum c = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, 1f);
c.setMetaData(new MapWritable());
for (int i = 0; i < 100; i++) {
- c.getMetaData().put(new LongWritable(i), new UTF8("" + 1));
+ c.getMetaData().put(new LongWritable(i), new Text("" + 1));
}
testWritable(c);
}
@@ -94,10 +94,10 @@
public void testEquals() {
MapWritable map1 = new MapWritable();
MapWritable map2 = new MapWritable();
- map1.put(new UTF8("key1"), new UTF8("val1"));
- map1.put(new UTF8("key2"), new UTF8("val2"));
- map2.put(new UTF8("key2"), new UTF8("val2"));
- map2.put(new UTF8("key1"), new UTF8("val1"));
+ map1.put(new Text("key1"), new Text("val1"));
+ map1.put(new Text("key2"), new Text("val2"));
+ map2.put(new Text("key2"), new Text("val2"));
+ map2.put(new Text("key1"), new Text("val1"));
assertTrue(map1.equals(map2));
}
@@ -137,13 +137,13 @@
System.out.println("needed time for reading map's: " + needed);
fs.delete(file);
- // UTF8
+ // Text
System.out.println("start writing utf8's");
- writer = new SequenceFile.Writer(fs, file, IntWritable.class, UTF8.class);
+ writer = new SequenceFile.Writer(fs, file, IntWritable.class, Text.class);
// write map
start = System.currentTimeMillis();
key = new IntWritable();
- UTF8 value = new UTF8();
+ Text value = new Text();
String s = "15726:15726";
for (int i = 0; i < 1000000; i++) {
key.set(i);
@@ -181,9 +181,9 @@
}
public void testRecycling() throws Exception {
- UTF8 value = new UTF8("value");
- UTF8 key1 = new UTF8("a");
- UTF8 key2 = new UTF8("b");
+ Text value = new Text("value");
+ Text key1 = new Text("a");
+ Text key2 = new Text("b");
MapWritable writable = new MapWritable();
writable.put(key1, value);
Modified: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?view=diff&rev=464654&r1=464653&r2=464654
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Mon Oct 16 13:38:57 2006
@@ -23,7 +23,7 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
-import org.apache.hadoop.io.UTF8;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDBTestUtil;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.Injector;
@@ -108,7 +108,7 @@
READ:
do {
- UTF8 key=new UTF8();
+ Text key=new Text();
Content value=new Content();
if(!reader.next(key, value)) break READ;
String contentString=new String(value.getContent());
Added: lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java?view=auto&rev=464654
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java (added)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/indexer/TestDeleteDuplicates.java Mon Oct 16 13:38:57 2006
@@ -0,0 +1,154 @@
+package org.apache.nutch.indexer;
+
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.MD5Hash;
+import org.apache.lucene.document.DateTools;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.DateTools.Resolution;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.nutch.analysis.NutchDocumentAnalyzer;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.util.NutchConfiguration;
+
+import junit.framework.TestCase;
+
+public class TestDeleteDuplicates extends TestCase {
+ Configuration conf;
+ FileSystem fs;
+ Path root;
+ Path index1;
+ Path index2;
+
+ public void setUp() throws Exception {
+ conf = NutchConfiguration.create();
+ conf.set("fs.default.name", "local");
+ fs = FileSystem.get(conf);
+ root = new Path("dedup2-test-" + new Random().nextInt());
+ // create test indexes
+ index1 = createIndex("index1", true, 1.0f, 10L);
+ index2 = createIndex("index2", false, 2.0f, 20L);
+ }
+
+ private Path createIndex(String name, boolean hashDup, float inc, long time) throws Exception {
+ Path idx = new Path(root, name);
+ Path sub = new Path(idx, "part-0000");
+ Directory dir = FSDirectory.getDirectory(sub.toString(), true);
+ IndexWriter writer = new IndexWriter(dir, new NutchDocumentAnalyzer(conf), true);
+ Document doc = makeDoc(name,
+ MD5Hash.digest("1").toString(),
+ "http://www.example.com/1",
+ 1.0f, time);
+ writer.addDocument(doc);
+ if (hashDup) {
+ doc = makeDoc(name,
+ MD5Hash.digest("1").toString(),
+ "http://www.example.com/2",
+ 1.0f + inc, time + 1);
+ } else {
+ doc = makeDoc(name,
+ MD5Hash.digest("2").toString(),
+ "http://www.example.com/1",
+ 1.0f + inc, time + 1);
+ }
+ writer.addDocument(doc);
+ writer.close();
+ return idx;
+ }
+
+ private Document makeDoc(String segment, String digest, String url, float boost, long time) {
+ Document doc = new Document();
+ doc.add(new Field("segment", segment, Field.Store.YES, Field.Index.NO));
+ doc.add(new Field("digest", digest, Field.Store.YES, Field.Index.NO));
+ doc.add(new Field("url", url, Field.Store.YES, Field.Index.TOKENIZED));
+ doc.setBoost(boost);
+ doc.add(new Field("boost", "" + boost, Field.Store.YES, Field.Index.NO));
+ doc.add(new Field("tstamp", DateTools.timeToString(time, Resolution.MILLISECOND), Field.Store.YES, Field.Index.NO));
+ return doc;
+ }
+
+ public void tearDown() throws Exception {
+ fs.delete(root);
+ }
+
+ public void testHashDuplicates() throws Exception {
+ DeleteDuplicates dedup = new DeleteDuplicates(conf);
+ dedup.dedup(new Path[]{index1});
+ FsDirectory dir = new FsDirectory(fs, new Path(index1, "part-0000"), false, conf);
+ IndexReader reader = IndexReader.open(dir);
+ assertEquals("only one doc left", reader.numDocs(), 1);
+ for (int i = 0; i < reader.maxDoc(); i++) {
+ if (reader.isDeleted(i)) {
+ System.out.println("-doc " + i + " deleted");
+ continue;
+ }
+ Document doc = reader.document(i);
+ // make sure we got the right one
+ assertEquals("check url", "http://www.example.com/2", doc.get("url"));
+ System.out.println(doc);
+ }
+ reader.close();
+ }
+
+ public void testUrlDuplicates() throws Exception {
+ DeleteDuplicates dedup = new DeleteDuplicates(conf);
+ dedup.dedup(new Path[]{index2});
+ FsDirectory dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
+ IndexReader reader = IndexReader.open(dir);
+ assertEquals("only one doc left", reader.numDocs(), 1);
+ MD5Hash hash = MD5Hash.digest("2");
+ for (int i = 0; i < reader.maxDoc(); i++) {
+ if (reader.isDeleted(i)) {
+ System.out.println("-doc " + i + " deleted");
+ continue;
+ }
+ Document doc = reader.document(i);
+ // make sure we got the right one
+ assertEquals("check hash", hash.toString(), doc.get("digest"));
+ System.out.println(doc);
+ }
+ reader.close();
+ }
+
+ public void testMixedDuplicates() throws Exception {
+ DeleteDuplicates dedup = new DeleteDuplicates(conf);
+ dedup.dedup(new Path[]{index1, index2});
+ FsDirectory dir = new FsDirectory(fs, new Path(index1, "part-0000"), false, conf);
+ IndexReader reader = IndexReader.open(dir);
+ assertEquals("only one doc left", reader.numDocs(), 1);
+ for (int i = 0; i < reader.maxDoc(); i++) {
+ if (reader.isDeleted(i)) {
+ System.out.println("-doc " + i + " deleted");
+ continue;
+ }
+ Document doc = reader.document(i);
+ // make sure we got the right one
+ assertEquals("check url", "http://www.example.com/2", doc.get("url"));
+ System.out.println(doc);
+ }
+ reader.close();
+ dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
+ reader = IndexReader.open(dir);
+ assertEquals("only one doc left", reader.numDocs(), 1);
+ MD5Hash hash = MD5Hash.digest("2");
+ for (int i = 0; i < reader.maxDoc(); i++) {
+ if (reader.isDeleted(i)) {
+ System.out.println("-doc " + i + " deleted");
+ continue;
+ }
+ Document doc = reader.document(i);
+ // make sure we got the right one
+ assertEquals("check hash", hash.toString(), doc.get("digest"));
+ System.out.println(doc);
+ }
+ reader.close();
+ }
+
+}