You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by cu...@apache.org on 2008/03/20 04:33:24 UTC
svn commit: r639138 [2/3] - in /hadoop/core/trunk: ./ src/contrib/index/
src/contrib/index/conf/ src/contrib/index/lib/ src/contrib/index/sample/
src/contrib/index/src/ src/contrib/index/src/java/
src/contrib/index/src/java/org/ src/contrib/index/src/j...
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/main/UpdateIndex.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/main/UpdateIndex.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/main/UpdateIndex.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/main/UpdateIndex.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,273 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.main;
+
+import java.io.IOException;
+import java.text.NumberFormat;
+import java.util.Arrays;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.contrib.index.mapred.IndexUpdateConfiguration;
+import org.apache.hadoop.contrib.index.mapred.IIndexUpdater;
+import org.apache.hadoop.contrib.index.mapred.Shard;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.util.ReflectionUtils;
+
+/**
+ * A distributed "index" is partitioned into "shards". Each shard corresponds
+ * to a Lucene instance. This class contains the main() method which uses a
+ * Map/Reduce job to analyze documents and update Lucene instances in parallel.
+ *
+ * The main() method in UpdateIndex requires the following information for
+ * updating the shards:
+ * - Input formatter. This specifies how to format the input documents.
+ * - Analysis. This defines the analyzer to use on the input. The analyzer
+ * determines whether a document is being inserted, updated, or deleted.
+ * For inserts or updates, the analyzer also converts each input document
+ * into a Lucene document.
+ * - Input paths. This provides the location(s) of updated documents,
+ * e.g., HDFS files or directories, or HBase tables.
+ * - Shard paths, or index path with the number of shards. Either specify
+ * the path for each shard, or specify an index path and the shards are
+ * the sub-directories of the index directory.
+ * - Output path. When the update to a shard is done, a message is put here.
+ * - Number of map tasks.
+ *
+ * All of the information can be specified in a configuration file. All but
+ * the first two can also be specified as command line options. Check out
+ * conf/index-config.xml.template for other configurable parameters.
+ *
+ * Note: Because of the parallel nature of Map/Reduce, the behaviour of
+ * multiple inserts, deletes or updates to the same document is undefined.
+ */
+public class UpdateIndex {
+ public static final Log LOG = LogFactory.getLog(UpdateIndex.class);
+
+ private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
+ static {
+ NUMBER_FORMAT.setMinimumIntegerDigits(5);
+ NUMBER_FORMAT.setGroupingUsed(false);
+ }
+
+ private static long now() {
+ return System.currentTimeMillis();
+ }
+
+ private static void printUsage(String cmd) {
+ System.err.println("Usage: java " + UpdateIndex.class.getName() + "\n"
+ + " -inputPaths <inputPath,inputPath>\n"
+ + " -outputPath <outputPath>\n"
+ + " -shards <shardDir,shardDir>\n"
+ + " -indexPath <indexPath>\n"
+ + " -numShards <num>\n"
+ + " -numMapTasks <num>\n"
+ + " -conf <confPath>\n"
+ + "Note: Do not use both -shards option and -indexPath option.");
+ }
+
+ private static String getIndexPath(Configuration conf) {
+ return conf.get("sea.index.path");
+ }
+
+ private static int getNumShards(Configuration conf) {
+ return conf.getInt("sea.num.shards", 1);
+ }
+
+ private static Shard[] createShards(String indexPath, int numShards,
+ Configuration conf) throws IOException {
+
+ String parent = Shard.normalizePath(indexPath) + Path.SEPARATOR;
+ long versionNumber = -1;
+ long generation = -1;
+
+ FileSystem fs = FileSystem.get(conf);
+ Path path = new Path(indexPath);
+
+ if (fs.exists(path)) {
+ FileStatus[] fileStatus = fs.listStatus(path);
+ String[] shardNames = new String[fileStatus.length];
+ int count = 0;
+ for (int i = 0; i < fileStatus.length; i++) {
+ if (fileStatus[i].isDir()) {
+ shardNames[count] = fileStatus[i].getPath().getName();
+ count++;
+ }
+ }
+ Arrays.sort(shardNames, 0, count);
+
+ Shard[] shards = new Shard[count >= numShards ? count : numShards];
+ for (int i = 0; i < count; i++) {
+ shards[i] =
+ new Shard(versionNumber, parent + shardNames[i], generation);
+ }
+
+ int number = count;
+ for (int i = count; i < numShards; i++) {
+ String shardPath;
+ while (true) {
+ shardPath = parent + NUMBER_FORMAT.format(number++);
+ if (!fs.exists(new Path(shardPath))) {
+ break;
+ }
+ }
+ shards[i] = new Shard(versionNumber, shardPath, generation);
+ }
+ return shards;
+ } else {
+ Shard[] shards = new Shard[numShards];
+ for (int i = 0; i < shards.length; i++) {
+ shards[i] =
+ new Shard(versionNumber, parent + NUMBER_FORMAT.format(i),
+ generation);
+ }
+ return shards;
+ }
+ }
+
+ /**
+ * The main() method
+ * @param argv
+ */
+ public static void main(String[] argv) {
+ if (argv.length == 0) {
+ printUsage("");
+ System.exit(-1);
+ }
+
+ String inputPathsString = null;
+ Path outputPath = null;
+ String shardsString = null;
+ String indexPath = null;
+ int numShards = -1;
+ int numMapTasks = -1;
+ Configuration conf = new Configuration();
+ String confPath = null;
+
+ // parse the command line
+ for (int i = 0; i < argv.length; i++) { // parse command line
+ if (argv[i].equals("-inputPaths")) {
+ inputPathsString = argv[++i];
+ } else if (argv[i].equals("-outputPath")) {
+ outputPath = new Path(argv[++i]);
+ } else if (argv[i].equals("-shards")) {
+ shardsString = argv[++i];
+ } else if (argv[i].equals("-indexPath")) {
+ indexPath = argv[++i];
+ } else if (argv[i].equals("-numShards")) {
+ numShards = Integer.parseInt(argv[++i]);
+ } else if (argv[i].equals("-numMapTasks")) {
+ numMapTasks = Integer.parseInt(argv[++i]);
+ } else if (argv[i].equals("-conf")) {
+ // add as a local FS resource
+ confPath = argv[++i];
+ conf.addResource(new Path(confPath));
+ } else {
+ System.out.println("Unknown option " + argv[i] + " w/ value "
+ + argv[++i]);
+ }
+ }
+ LOG.info("inputPaths = " + inputPathsString);
+ LOG.info("outputPath = " + outputPath);
+ LOG.info("shards = " + shardsString);
+ LOG.info("indexPath = " + indexPath);
+ LOG.info("numShards = " + numShards);
+ LOG.info("numMapTasks= " + numMapTasks);
+ LOG.info("confPath = " + confPath);
+
+ Path[] inputPaths = null;
+ Shard[] shards = null;
+
+ JobConf jobConf = new JobConf(conf);
+ IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(jobConf);
+
+ if (inputPathsString != null) {
+ jobConf.set("mapred.input.dir", inputPathsString);
+ }
+ inputPaths = jobConf.getInputPaths();
+ if (inputPaths.length == 0) {
+ inputPaths = null;
+ }
+
+ if (outputPath == null) {
+ outputPath = jobConf.getOutputPath();
+ }
+
+ if (inputPaths == null || outputPath == null) {
+ System.err.println("InputPaths and outputPath must be specified.");
+ printUsage("");
+ System.exit(-1);
+ }
+
+ if (shardsString != null) {
+ iconf.setIndexShards(shardsString);
+ }
+ shards = Shard.getIndexShards(iconf);
+ if (shards != null && shards.length == 0) {
+ shards = null;
+ }
+
+ if (indexPath == null) {
+ indexPath = getIndexPath(conf);
+ }
+ if (numShards <= 0) {
+ numShards = getNumShards(conf);
+ }
+
+ if (shards == null && indexPath == null) {
+ System.err.println("Either shards or indexPath must be specified.");
+ printUsage("");
+ System.exit(-1);
+ }
+
+ if (numMapTasks <= 0) {
+ numMapTasks = jobConf.getNumMapTasks();
+ }
+
+ try {
+ // create shards and set their directories if necessary
+ if (shards == null) {
+ shards = createShards(indexPath, numShards, conf);
+ }
+
+ long startTime = now();
+ try {
+ IIndexUpdater updater =
+ (IIndexUpdater) ReflectionUtils.newInstance(
+ iconf.getIndexUpdaterClass(), conf);
+ LOG.info("sea.index.updater = "
+ + iconf.getIndexUpdaterClass().getName());
+
+ updater.run(conf, inputPaths, outputPath, numMapTasks, shards);
+ LOG.info("Index update job is done");
+
+ } finally {
+ long elapsedTime = now() - startTime;
+ LOG.info("Elapsed time is " + (elapsedTime / 1000) + "s");
+ System.out.println("Elapsed time is " + (elapsedTime / 1000) + "s");
+ }
+ } catch (Exception e) {
+ e.printStackTrace(System.err);
+ }
+ }
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/main/UpdateIndex.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentAndOp.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentAndOp.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentAndOp.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentAndOp.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,208 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.mapred;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.Term;
+
+/**
+ * This class represents an indexing operation. The operation can be an insert,
+ * a delete or an update. If the operation is an insert or an update, a (new)
+ * document must be specified. If the operation is a delete or an update, a
+ * delete term must be specified.
+ */
+public class DocumentAndOp implements Writable {
+
+ /**
+ * This class represents the type of an operation - an insert, a delete or
+ * an update.
+ */
+ public static final class Op {
+ public static final Op INSERT = new Op("INSERT");
+ public static final Op DELETE = new Op("DELETE");
+ public static final Op UPDATE = new Op("UPDATE");
+
+ private String name;
+
+ private Op(String name) {
+ this.name = name;
+ }
+
+ public String toString() {
+ return name;
+ }
+ }
+
+ private Op op;
+ private Document doc;
+ private Term term;
+
+ /**
+ * Constructor for no operation.
+ */
+ public DocumentAndOp() {
+ }
+
+ /**
+ * Constructor for an insert operation.
+ * @param op
+ * @param doc
+ */
+ public DocumentAndOp(Op op, Document doc) {
+ assert (op == Op.INSERT);
+ this.op = op;
+ this.doc = doc;
+ this.term = null;
+ }
+
+ /**
+ * Constructor for a delete operation.
+ * @param op
+ * @param term
+ */
+ public DocumentAndOp(Op op, Term term) {
+ assert (op == Op.DELETE);
+ this.op = op;
+ this.doc = null;
+ this.term = term;
+ }
+
+ /**
+ * Constructor for an insert, a delete or an update operation.
+ * @param op
+ * @param doc
+ * @param term
+ */
+ public DocumentAndOp(Op op, Document doc, Term term) {
+ if (op == Op.INSERT) {
+ assert (doc != null);
+ assert (term == null);
+ } else if (op == Op.DELETE) {
+ assert (doc == null);
+ assert (term != null);
+ } else {
+ assert (op == Op.UPDATE);
+ assert (doc != null);
+ assert (term != null);
+ }
+ this.op = op;
+ this.doc = doc;
+ this.term = term;
+ }
+
+ /**
+ * Set the instance to be an insert operation.
+ * @param doc
+ */
+ public void setInsert(Document doc) {
+ this.op = Op.INSERT;
+ this.doc = doc;
+ this.term = null;
+ }
+
+ /**
+ * Set the instance to be a delete operation.
+ * @param term
+ */
+ public void setDelete(Term term) {
+ this.op = Op.DELETE;
+ this.doc = null;
+ this.term = term;
+ }
+
+ /**
+ * Set the instance to be an update operation.
+ * @param doc
+ * @param term
+ */
+ public void setUpdate(Document doc, Term term) {
+ this.op = Op.UPDATE;
+ this.doc = doc;
+ this.term = term;
+ }
+
+ /**
+ * Get the type of operation.
+ * @return the type of the operation.
+ */
+ public Op getOp() {
+ return op;
+ }
+
+ /**
+ * Get the document.
+ * @return the document
+ */
+ public Document getDocument() {
+ return doc;
+ }
+
+ /**
+ * Get the term.
+ * @return the term
+ */
+ public Term getTerm() {
+ return term;
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#toString()
+ */
+ public String toString() {
+ StringBuilder buffer = new StringBuilder();
+ buffer.append(this.getClass().getName());
+ buffer.append("[op=");
+ buffer.append(op);
+ buffer.append(", doc=");
+ if (doc != null) {
+ buffer.append(doc);
+ } else {
+ buffer.append("null");
+ }
+ buffer.append(", term=");
+ if (term != null) {
+ buffer.append(term);
+ } else {
+ buffer.append("null");
+ }
+ buffer.append("]");
+ return buffer.toString();
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
+ */
+ public void write(DataOutput out) throws IOException {
+ throw new IOException(this.getClass().getName()
+ + ".write should never be called");
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
+ */
+ public void readFields(DataInput in) throws IOException {
+ throw new IOException(this.getClass().getName()
+ + ".readFields should never be called");
+ }
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentAndOp.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentID.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentID.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentID.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentID.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.mapred;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
+
+/**
+ * The class represents a document id, which is of type text.
+ */
+public class DocumentID implements WritableComparable {
+ private final Text docID;
+
+ /**
+ * Constructor.
+ */
+ public DocumentID() {
+ docID = new Text();
+ }
+
+ /**
+ * The text of the document id.
+ * @return the text
+ */
+ public Text getText() {
+ return docID;
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Comparable#compareTo(java.lang.Object)
+ */
+ public int compareTo(Object obj) {
+ if (this == obj) {
+ return 0;
+ } else {
+ return docID.compareTo(((DocumentID) obj).docID);
+ }
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#hashCode()
+ */
+ public int hashCode() {
+ return docID.hashCode();
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#toString()
+ */
+ public String toString() {
+ return this.getClass().getName() + "[" + docID + "]";
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
+ */
+ public void write(DataOutput out) throws IOException {
+ throw new IOException(this.getClass().getName()
+ + ".write should never be called");
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
+ */
+ public void readFields(DataInput in) throws IOException {
+ throw new IOException(this.getClass().getName()
+ + ".readFields should never be called");
+ }
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentID.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IDistributionPolicy.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IDistributionPolicy.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IDistributionPolicy.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IDistributionPolicy.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.mapred;
+
/**
 * A distribution policy decides, given a document with a document id, which
 * one shard the request should be sent to if the request is an insert, and
 * which shard(s) the request should be sent to if the request is a delete.
 */
public interface IDistributionPolicy {

  /**
   * Initialization. It must be called before any chooseShard() is called.
   * @param shards  the set of shards over which documents are distributed
   */
  void init(Shard[] shards);

  /**
   * Choose a shard to send an insert request.
   * @param key  the id of the document being inserted
   * @return the index (into the array passed to init()) of the chosen shard
   */
  int chooseShardForInsert(DocumentID key);

  /**
   * Choose a shard or all shards to send a delete request. E.g. a round-robin
   * distribution policy would send a delete request to all the shards.
   * -1 represents all the shards.
   * @param key  the id of the document being deleted
   * @return the index of the chosen shard, -1 if all the shards are chosen
   */
  int chooseShardForDelete(DocumentID key);

}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IDistributionPolicy.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IIndexUpdater.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IIndexUpdater.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IIndexUpdater.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IIndexUpdater.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.mapred;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
/**
 * A class that implements this index updater interface should create a
 * Map/Reduce job configuration and run the Map/Reduce job to analyze
 * documents and update Lucene instances in parallel.
 */
public interface IIndexUpdater {

  /**
   * Create a Map/Reduce job configuration and run the Map/Reduce job to
   * analyze documents and update Lucene instances in parallel.
   * @param conf        the configuration the job is built from
   * @param inputPaths  location(s) of the updated documents
   * @param outputPath  where per-shard completion messages are written
   * @param numMapTasks the number of map tasks to use
   * @param shards      the index shards to update
   * @throws IOException if the job cannot be set up or fails
   */
  void run(Configuration conf, Path[] inputPaths, Path outputPath,
      int numMapTasks, Shard[] shards) throws IOException;

}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IIndexUpdater.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/ILocalAnalysis.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/ILocalAnalysis.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/ILocalAnalysis.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/ILocalAnalysis.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,32 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.mapred;
+
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.Mapper;
+
/**
 * Application specific local analysis. The output type must be (DocumentID,
 * DocumentAndOp).
 *
 * A local analysis is a Mapper that turns each input (key, value) record
 * into indexing operations keyed by document id.
 * @param <K> the type of the input key
 * @param <V> the type of the input value
 */
public interface ILocalAnalysis<K extends WritableComparable, V extends Writable>
    extends Mapper<K, V, DocumentID, DocumentAndOp> {

}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/ILocalAnalysis.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateCombiner.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateCombiner.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateCombiner.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateCombiner.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.mapred;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * This combiner combines multiple intermediate forms into one intermediate
+ * form. More specifically, the input intermediate forms are a single-document
+ * ram index and/or a single delete term. An output intermediate form contains
+ * a multi-document ram index and/or multiple delete terms.
+ */
+public class IndexUpdateCombiner extends MapReduceBase implements
+ Reducer<Shard, IntermediateForm, Shard, IntermediateForm> {
+ static final Log LOG = LogFactory.getLog(IndexUpdateCombiner.class);
+
+ IndexUpdateConfiguration iconf;
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.Reducer#reduce(java.lang.Object, java.util.Iterator, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
+ */
+ public void reduce(Shard key, Iterator<IntermediateForm> values,
+ OutputCollector<Shard, IntermediateForm> output, Reporter reporter)
+ throws IOException {
+
+ LOG.info("Construct a form writer for " + key);
+ IntermediateForm form = new IntermediateForm();
+ form.configure(iconf);
+ while (values.hasNext()) {
+ IntermediateForm singleDocForm = values.next();
+ form.process(singleDocForm);
+ }
+ form.closeWriter();
+ LOG.info("Closed the form writer for " + key + ", form = " + form);
+
+ output.collect(key, form);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.MapReduceBase#configure(org.apache.hadoop.mapred.JobConf)
+ */
+ public void configure(JobConf job) {
+ iconf = new IndexUpdateConfiguration(job);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.MapReduceBase#close()
+ */
+ public void close() throws IOException {
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateCombiner.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateConfiguration.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateConfiguration.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateConfiguration.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateConfiguration.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,238 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.mapred;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.contrib.index.example.HashingDistributionPolicy;
+import org.apache.hadoop.contrib.index.example.LineDocInputFormat;
+import org.apache.hadoop.contrib.index.example.LineDocLocalAnalysis;
+import org.apache.hadoop.mapred.InputFormat;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+
+/**
+ * This class provides the getters and the setters to a number of parameters.
+ * Most of the parameters are related to the index update and the rest are
+ * from the existing Map/Reduce parameters.
+ */
+public class IndexUpdateConfiguration {
+ // Backing Hadoop configuration; every getter/setter below delegates to it.
+ final Configuration conf;
+
+ /**
+ * Constructor
+ * @param conf the underlying Hadoop configuration object
+ */
+ public IndexUpdateConfiguration(Configuration conf) {
+ this.conf = conf;
+ }
+
+ /**
+ * Get the underlying configuration object.
+ * @return the configuration
+ */
+ public Configuration getConfiguration() {
+ return conf;
+ }
+
+ //
+ // existing map/reduce properties
+ //
+ // NOTE(review): dead commented-out accessor left over from development.
+ // public int getIOFileBufferSize() {
+ // return getInt("io.file.buffer.size", 4096);
+ // }
+
+ /**
+ * Get the IO sort space in MB.
+ * @return the IO sort space in MB
+ */
+ public int getIOSortMB() {
+ return conf.getInt("io.sort.mb", 100);
+ }
+
+ /**
+ * Set the IO sort space in MB.
+ * @param mb the IO sort space in MB
+ */
+ public void setIOSortMB(int mb) {
+ conf.setInt("io.sort.mb", mb);
+ }
+
+ /**
+ * Get the Map/Reduce temp directory.
+ * @return the Map/Reduce temp directory
+ */
+ public String getMapredTempDir() {
+ return conf.get("mapred.temp.dir");
+ }
+
+ //
+ // properties for index update; all use the "sea." property-name prefix
+ //
+ /**
+ * Get the distribution policy class.
+ * @return the distribution policy class
+ */
+ public Class<? extends IDistributionPolicy> getDistributionPolicyClass() {
+ return conf.getClass("sea.distribution.policy",
+ HashingDistributionPolicy.class, IDistributionPolicy.class);
+ }
+
+ /**
+ * Set the distribution policy class.
+ * @param theClass the distribution policy class
+ */
+ public void setDistributionPolicyClass(
+ Class<? extends IDistributionPolicy> theClass) {
+ conf.setClass("sea.distribution.policy", theClass,
+ IDistributionPolicy.class);
+ }
+
+ /**
+ * Get the analyzer class.
+ * @return the analyzer class
+ */
+ public Class<? extends Analyzer> getDocumentAnalyzerClass() {
+ return conf.getClass("sea.document.analyzer", StandardAnalyzer.class,
+ Analyzer.class);
+ }
+
+ /**
+ * Set the analyzer class.
+ * @param theClass the analyzer class
+ */
+ public void setDocumentAnalyzerClass(Class<? extends Analyzer> theClass) {
+ conf.setClass("sea.document.analyzer", theClass, Analyzer.class);
+ }
+
+ /**
+ * Get the index input format class.
+ * @return the index input format class
+ */
+ public Class<? extends InputFormat> getIndexInputFormatClass() {
+ return conf.getClass("sea.input.format", LineDocInputFormat.class,
+ InputFormat.class);
+ }
+
+ /**
+ * Set the index input format class.
+ * @param theClass the index input format class
+ */
+ public void setIndexInputFormatClass(Class<? extends InputFormat> theClass) {
+ conf.setClass("sea.input.format", theClass, InputFormat.class);
+ }
+
+ /**
+ * Get the index updater class.
+ * @return the index updater class
+ */
+ public Class<? extends IIndexUpdater> getIndexUpdaterClass() {
+ return conf.getClass("sea.index.updater", IndexUpdater.class,
+ IIndexUpdater.class);
+ }
+
+ /**
+ * Set the index updater class.
+ * @param theClass the index updater class
+ */
+ public void setIndexUpdaterClass(Class<? extends IIndexUpdater> theClass) {
+ conf.setClass("sea.index.updater", theClass, IIndexUpdater.class);
+ }
+
+ /**
+ * Get the local analysis class.
+ * @return the local analysis class
+ */
+ public Class<? extends ILocalAnalysis> getLocalAnalysisClass() {
+ return conf.getClass("sea.local.analysis", LineDocLocalAnalysis.class,
+ ILocalAnalysis.class);
+ }
+
+ /**
+ * Set the local analysis class.
+ * @param theClass the local analysis class
+ */
+ public void setLocalAnalysisClass(Class<? extends ILocalAnalysis> theClass) {
+ conf.setClass("sea.local.analysis", theClass, ILocalAnalysis.class);
+ }
+
+ /**
+ * Get the string representation of a number of shards.
+ * @return the string representation of a number of shards
+ */
+ public String getIndexShards() {
+ return conf.get("sea.index.shards");
+ }
+
+ /**
+ * Set the string representation of a number of shards.
+ * @param shards the string representation of a number of shards
+ */
+ public void setIndexShards(String shards) {
+ conf.set("sea.index.shards", shards);
+ }
+
+ /**
+ * Get the max field length for a Lucene instance.
+ * @return the max field length for a Lucene instance, or -1 if unset
+ */
+ public int getIndexMaxFieldLength() {
+ return conf.getInt("sea.max.field.length", -1);
+ }
+
+ /**
+ * Set the max field length for a Lucene instance.
+ * @param maxFieldLength the max field length for a Lucene instance
+ */
+ public void setIndexMaxFieldLength(int maxFieldLength) {
+ conf.setInt("sea.max.field.length", maxFieldLength);
+ }
+
+ /**
+ * Get the max number of segments for a Lucene instance.
+ * @return the max number of segments for a Lucene instance, or -1 if unset
+ */
+ public int getIndexMaxNumSegments() {
+ return conf.getInt("sea.max.num.segments", -1);
+ }
+
+ /**
+ * Set the max number of segments for a Lucene instance.
+ * @param maxNumSegments the max number of segments for a Lucene instance
+ */
+ public void setIndexMaxNumSegments(int maxNumSegments) {
+ conf.setInt("sea.max.num.segments", maxNumSegments);
+ }
+
+ /**
+ * Check whether to use the compound file format for a Lucene instance.
+ * @return true if using the compound file format for a Lucene instance
+ */
+ public boolean getIndexUseCompoundFile() {
+ return conf.getBoolean("sea.use.compound.file", false);
+ }
+
+ /**
+ * Set whether to use the compound file format for a Lucene instance.
+ * @param useCompoundFile whether to use the compound file format
+ */
+ public void setIndexUseCompoundFile(boolean useCompoundFile) {
+ conf.setBoolean("sea.use.compound.file", useCompoundFile);
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateConfiguration.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateMapper.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateMapper.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateMapper.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateMapper.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,199 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.mapred;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.Mapper;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.ReflectionUtils;
+import org.apache.lucene.analysis.Analyzer;
+
+/**
+ * This class applies local analysis on a key-value pair and then convert the
+ * result docid-operation pair to a shard-and-intermediate form pair.
+ */
+public class IndexUpdateMapper<K extends WritableComparable, V extends Writable>
+ extends MapReduceBase implements Mapper<K, V, Shard, IntermediateForm> {
+ static final Log LOG = LogFactory.getLog(IndexUpdateMapper.class);
+
+ /**
+ * Get the map output key class.
+ * @return the map output key class
+ */
+ public static Class<? extends WritableComparable> getMapOutputKeyClass() {
+ return Shard.class;
+ }
+
+ /**
+ * Get the map output value class.
+ * @return the map output value class
+ */
+ public static Class<? extends Writable> getMapOutputValueClass() {
+ return IntermediateForm.class;
+ }
+
+ IndexUpdateConfiguration iconf;
+ private Analyzer analyzer;
+ private Shard[] shards;
+ private IDistributionPolicy distributionPolicy;
+
+ private ILocalAnalysis<K, V> localAnalysis;
+ // One-record slots filled by tmpCollector during a single map() call.
+ private DocumentID tmpKey;
+ private DocumentAndOp tmpValue;
+
+ // Collector handed to local analysis; it captures the (at most one)
+ // docid/operation pair emitted for the current input record.
+ private OutputCollector<DocumentID, DocumentAndOp> tmpCollector =
+ new OutputCollector<DocumentID, DocumentAndOp>() {
+ public void collect(DocumentID key, DocumentAndOp value)
+ throws IOException {
+ tmpKey = key;
+ tmpValue = value;
+ }
+ };
+
+ /**
+ * Map a key-value pair to a shard-and-intermediate form pair. Internally,
+ * the local analysis is first applied to map the key-value pair to a
+ * document id-and-operation pair, then the docid-and-operation pair is
+ * mapped to a shard-intermediate form pair. The intermediate form is of the
+ * form of a single-document ram index and/or a single delete term.
+ */
+ public void map(K key, V value,
+ OutputCollector<Shard, IntermediateForm> output, Reporter reporter)
+ throws IOException {
+
+ synchronized (this) {
+ // Reset the temp slots before local analysis so a record for which
+ // nothing is collected cannot re-emit the previous record's stale
+ // docid/operation pair.
+ tmpKey = null;
+ tmpValue = null;
+ localAnalysis.map(key, value, tmpCollector, reporter);
+
+ if (tmpKey != null && tmpValue != null) {
+ DocumentAndOp doc = tmpValue;
+ IntermediateForm form = new IntermediateForm();
+ form.configure(iconf);
+ form.process(doc, analyzer);
+ form.closeWriter();
+
+ if (doc.getOp() == DocumentAndOp.Op.INSERT) {
+ int chosenShard = distributionPolicy.chooseShardForInsert(tmpKey);
+ if (chosenShard >= 0) {
+ // insert into one shard
+ output.collect(shards[chosenShard], form);
+ } else {
+ throw new IOException("Chosen shard for insert must be >= 0");
+ }
+
+ } else if (doc.getOp() == DocumentAndOp.Op.DELETE) {
+ int chosenShard = distributionPolicy.chooseShardForDelete(tmpKey);
+ if (chosenShard >= 0) {
+ // delete from one shard
+ output.collect(shards[chosenShard], form);
+ } else {
+ // broadcast delete to all shards
+ for (int i = 0; i < shards.length; i++) {
+ output.collect(shards[i], form);
+ }
+ }
+
+ } else { // UPDATE
+ int insertToShard = distributionPolicy.chooseShardForInsert(tmpKey);
+ int deleteFromShard =
+ distributionPolicy.chooseShardForDelete(tmpKey);
+
+ if (insertToShard >= 0) {
+ if (insertToShard == deleteFromShard) {
+ // update into one shard
+ output.collect(shards[insertToShard], form);
+ } else {
+ // The delete and insert go to different shards, so split the
+ // update into a separate deletion form and insertion form.
+ IntermediateForm deletionForm = new IntermediateForm();
+ deletionForm.configure(iconf);
+ deletionForm.process(new DocumentAndOp(DocumentAndOp.Op.DELETE,
+ doc.getTerm()), analyzer);
+ deletionForm.closeWriter();
+
+ if (deleteFromShard >= 0) {
+ // delete from one shard
+ output.collect(shards[deleteFromShard], deletionForm);
+ } else {
+ // broadcast delete to all shards
+ for (int i = 0; i < shards.length; i++) {
+ output.collect(shards[i], deletionForm);
+ }
+ }
+
+ // prepare an insertion form
+ IntermediateForm insertionForm = new IntermediateForm();
+ insertionForm.configure(iconf);
+ insertionForm.process(new DocumentAndOp(DocumentAndOp.Op.INSERT,
+ doc.getDocument()), analyzer);
+ insertionForm.closeWriter();
+
+ // insert into one shard
+ output.collect(shards[insertToShard], insertionForm);
+ }
+ } else {
+ throw new IOException("Chosen shard for insert must be >= 0");
+ }
+ }
+ }
+ }
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.MapReduceBase#configure(org.apache.hadoop.mapred.JobConf)
+ */
+ public void configure(JobConf job) {
+ iconf = new IndexUpdateConfiguration(job);
+ analyzer =
+ (Analyzer) ReflectionUtils.newInstance(
+ iconf.getDocumentAnalyzerClass(), job);
+
+ localAnalysis =
+ (ILocalAnalysis) ReflectionUtils.newInstance(
+ iconf.getLocalAnalysisClass(), job);
+ localAnalysis.configure(job);
+
+ shards = Shard.getIndexShards(iconf);
+
+ distributionPolicy =
+ (IDistributionPolicy) ReflectionUtils.newInstance(
+ iconf.getDistributionPolicyClass(), job);
+ distributionPolicy.init(shards);
+
+ LOG.info("sea.document.analyzer = " + analyzer.getClass().getName());
+ LOG.info("sea.local.analysis = " + localAnalysis.getClass().getName());
+ LOG.info(shards.length + " shards = " + iconf.getIndexShards());
+ LOG.info("sea.distribution.policy = "
+ + distributionPolicy.getClass().getName());
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.MapReduceBase#close()
+ */
+ public void close() throws IOException {
+ localAnalysis.close();
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateMapper.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateOutputFormat.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateOutputFormat.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateOutputFormat.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateOutputFormat.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.mapred;
+
+import java.io.IOException;
+
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputFormatBase;
+import org.apache.hadoop.mapred.RecordWriter;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.hadoop.util.Progressable;
+
+/**
+ * The record writer of this output format simply puts a message in an output
+ * path when a shard update is done.
+ */
+public class IndexUpdateOutputFormat extends OutputFormatBase<Shard, Text> {
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.OutputFormatBase#getRecordWriter(org.apache.hadoop.fs.FileSystem, org.apache.hadoop.mapred.JobConf, java.lang.String, org.apache.hadoop.util.Progressable)
+ */
+ public RecordWriter<Shard, Text> getRecordWriter(final FileSystem fs,
+ JobConf job, String name, final Progressable progress)
+ throws IOException {
+
+ final Path perm = new Path(job.getOutputPath(), name);
+
+ // The writer only creates an empty "done_<shard>" marker file per shard;
+ // no record data is actually written.
+ return new RecordWriter<Shard, Text>() {
+ public void write(Shard key, Text value) throws IOException {
+ assert (IndexUpdateReducer.DONE.equals(value));
+
+ // Derive a flat file name from the shard directory path.
+ String shardName = key.getDirectory();
+ shardName = shardName.replace("/", "_");
+
+ Path doneFile =
+ new Path(perm, IndexUpdateReducer.DONE + "_" + shardName);
+ if (!fs.exists(doneFile)) {
+ fs.createNewFile(doneFile);
+ }
+ }
+
+ public void close(final Reporter reporter) throws IOException {
+ }
+ };
+ }
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateOutputFormat.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdatePartitioner.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdatePartitioner.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdatePartitioner.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdatePartitioner.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.mapred;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.Partitioner;
+
+/**
+ * This partitioner class puts the values of the same key - in this case the
+ * same shard - in the same partition.
+ */
+public class IndexUpdatePartitioner implements
+ Partitioner<Shard, IntermediateForm> {
+
+ private Shard[] shards;
+ // Maps each configured shard to its index, i.e. its target partition.
+ private Map<Shard, Integer> map;
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.Partitioner#getPartition(java.lang.Object, java.lang.Object, int)
+ */
+ public int getPartition(Shard key, IntermediateForm value, int numPartitions) {
+ // NOTE(review): map.get(key) throws NPE if key is not a configured
+ // shard — presumably mappers only emit configured shards; verify.
+ int partition = map.get(key).intValue();
+ if (partition < numPartitions) {
+ return partition;
+ } else {
+ // Clamp when there are fewer partitions than shards.
+ return numPartitions - 1;
+ }
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf)
+ */
+ public void configure(JobConf job) {
+ shards = Shard.getIndexShards(new IndexUpdateConfiguration(job));
+ map = new HashMap<Shard, Integer>();
+ for (int i = 0; i < shards.length; i++) {
+ map.put(shards[i], i);
+ }
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdatePartitioner.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateReducer.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateReducer.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateReducer.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateReducer.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.mapred;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.contrib.index.lucene.ShardWriter;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Closeable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.io.WritableComparable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.MapReduceBase;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reducer;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * This reducer applies to a shard the changes for it. A "new version" of
+ * a shard is created at the end of a reduce. It is important to note that
+ * the new version of the shard is not derived from scratch. By leveraging
+ * Lucene's update algorithm, the new version of each Lucene instance will
+ * share as many files as possible as the previous version.
+ */
+public class IndexUpdateReducer extends MapReduceBase implements
+ Reducer<Shard, IntermediateForm, Shard, Text> {
+ static final Log LOG = LogFactory.getLog(IndexUpdateReducer.class);
+ // Marker value emitted once per shard when its update completes.
+ static final Text DONE = new Text("done");
+
+ /**
+ * Get the reduce output key class.
+ * @return the reduce output key class
+ */
+ public static Class<? extends WritableComparable> getOutputKeyClass() {
+ return Shard.class;
+ }
+
+ /**
+ * Get the reduce output value class.
+ * @return the reduce output value class
+ */
+ public static Class<? extends Writable> getOutputValueClass() {
+ return Text.class;
+ }
+
+ private IndexUpdateConfiguration iconf;
+ private String mapredTempDir;
+
+ /**
+ * Apply all intermediate forms for one shard to that shard's Lucene
+ * instance, then emit a DONE marker for the shard.
+ */
+ public void reduce(Shard key, Iterator<IntermediateForm> values,
+ OutputCollector<Shard, Text> output, Reporter reporter)
+ throws IOException {
+
+ LOG.info("Construct a shard writer for " + key);
+ FileSystem fs = FileSystem.get(iconf.getConfiguration());
+ String temp =
+ mapredTempDir + Path.SEPARATOR + "shard_" + System.currentTimeMillis();
+ final ShardWriter writer = new ShardWriter(fs, key, temp, iconf);
+
+ // update the shard
+ while (values.hasNext()) {
+ IntermediateForm form = values.next();
+ writer.process(form);
+ reporter.progress();
+ }
+
+ // Closing the shard can take a long time, so report progress from a
+ // heartbeat thread while the close runs, to avoid task timeout.
+ final Reporter fReporter = reporter;
+ new Closeable() {
+ // volatile: written by the reduce thread, read by the heartbeat
+ // thread; without it the heartbeat thread might never observe
+ // closed == true and would spin forever.
+ volatile boolean closed = false;
+
+ public void close() throws IOException {
+ // spawn a thread to give progress heartbeats
+ Thread prog = new Thread() {
+ public void run() {
+ while (!closed) {
+ try {
+ fReporter.setStatus("closing");
+ Thread.sleep(1000);
+ } catch (InterruptedException e) {
+ continue;
+ } catch (Throwable e) {
+ return;
+ }
+ }
+ }
+ };
+
+ try {
+ prog.start();
+
+ if (writer != null) {
+ writer.close();
+ }
+ } finally {
+ // Always stop the heartbeat thread, even if close() throws.
+ closed = true;
+ }
+ }
+ }.close();
+ LOG.info("Closed the shard writer for " + key + ", writer = " + writer);
+
+ output.collect(key, DONE);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.MapReduceBase#configure(org.apache.hadoop.mapred.JobConf)
+ */
+ public void configure(JobConf job) {
+ iconf = new IndexUpdateConfiguration(job);
+ mapredTempDir = iconf.getMapredTempDir();
+ mapredTempDir = Shard.normalizePath(mapredTempDir);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.MapReduceBase#close()
+ */
+ public void close() throws IOException {
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateReducer.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdater.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdater.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdater.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdater.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,151 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.mapred;
+
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.contrib.index.lucene.FileSystemDirectory;
+import org.apache.hadoop.contrib.index.lucene.LuceneUtil;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapred.JobClient;
+import org.apache.hadoop.mapred.JobConf;
+
+/**
+ * An implementation of an index updater interface which creates a Map/Reduce
+ * job configuration and run the Map/Reduce job to analyze documents and update
+ * Lucene instances in parallel.
+ */
+public class IndexUpdater implements IIndexUpdater {
+ public static final Log LOG = LogFactory.getLog(IndexUpdater.class);
+
+ public IndexUpdater() {
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.contrib.index.mapred.IIndexUpdater#run(org.apache.hadoop.conf.Configuration, org.apache.hadoop.fs.Path[], org.apache.hadoop.fs.Path, int, org.apache.hadoop.contrib.index.mapred.Shard[])
+ */
+ public void run(Configuration conf, Path[] inputPaths, Path outputPath,
+ int numMapTasks, Shard[] shards) throws IOException {
+ JobConf jobConf =
+ createJob(conf, inputPaths, outputPath, numMapTasks, shards);
+ JobClient.runJob(jobConf);
+ }
+
+ /**
+ * Build the Map/Reduce job configuration for an index update run.
+ * @param conf the base configuration
+ * @param inputPaths the input paths; must contain at least one path
+ * @param outputPath where the per-shard DONE markers are written
+ * @param numMapTasks the requested number of map tasks
+ * @param shards the shards to update; one reduce task per shard
+ * @return the fully-wired job configuration
+ */
+ JobConf createJob(Configuration conf, Path[] inputPaths, Path outputPath,
+ int numMapTasks, Shard[] shards) throws IOException {
+ // set the starting generation for each shard
+ // when a reduce task fails, a new reduce task
+ // has to know where to re-start
+ setShardGeneration(conf, shards);
+
+ // iconf.set sets properties in conf
+ IndexUpdateConfiguration iconf = new IndexUpdateConfiguration(conf);
+ Shard.setIndexShards(iconf, shards);
+
+ // MapTask.MapOutputBuffer uses "io.sort.mb" to decide its max buffer size
+ // (max buffer size = 1/2 * "io.sort.mb").
+ // Here we halve "io.sort.mb" because we use the other half memory to
+ // build an intermediate form/index in Combiner.
+ iconf.setIOSortMB(iconf.getIOSortMB() / 2);
+
+ // create the job configuration
+ JobConf jobConf = new JobConf(conf, IndexUpdater.class);
+ jobConf.setJobName(this.getClass().getName() + "_"
+ + System.currentTimeMillis());
+
+ // provided by application
+ jobConf.setInputPath(inputPaths[0]);
+ for (int i = 1; i < inputPaths.length; i++) {
+ jobConf.addInputPath(inputPaths[i]);
+ }
+
+ jobConf.setOutputPath(outputPath);
+
+ jobConf.setNumMapTasks(numMapTasks);
+
+ // already set shards
+ jobConf.setNumReduceTasks(shards.length);
+
+ jobConf.setInputFormat(iconf.getIndexInputFormatClass());
+
+ // Log the effective job settings for diagnostics.
+ Path[] inputs = jobConf.getInputPaths();
+ StringBuilder buffer = new StringBuilder(inputs[0].toString());
+ for (int i = 1; i < inputs.length; i++) {
+ buffer.append(",");
+ buffer.append(inputs[i].toString());
+ }
+ LOG.info("mapred.input.dir = " + buffer.toString());
+ LOG.info("mapred.output.dir = " + jobConf.getOutputPath().toString());
+ LOG.info("mapred.map.tasks = " + jobConf.getNumMapTasks());
+ LOG.info("mapred.reduce.tasks = " + jobConf.getNumReduceTasks());
+ LOG.info(shards.length + " shards = " + iconf.getIndexShards());
+ // better if we don't create the input format instance
+ LOG.info("mapred.input.format.class = "
+ + jobConf.getInputFormat().getClass().getName());
+
+ // set by the system
+ jobConf.setMapOutputKeyClass(IndexUpdateMapper.getMapOutputKeyClass());
+ jobConf.setMapOutputValueClass(IndexUpdateMapper.getMapOutputValueClass());
+ jobConf.setOutputKeyClass(IndexUpdateReducer.getOutputKeyClass());
+ jobConf.setOutputValueClass(IndexUpdateReducer.getOutputValueClass());
+
+ jobConf.setMapperClass(IndexUpdateMapper.class);
+ jobConf.setPartitionerClass(IndexUpdatePartitioner.class);
+ jobConf.setCombinerClass(IndexUpdateCombiner.class);
+ jobConf.setReducerClass(IndexUpdateReducer.class);
+
+ jobConf.setOutputFormat(IndexUpdateOutputFormat.class);
+
+ return jobConf;
+ }
+
+ /**
+ * Record each shard's current Lucene segment generation so a restarted
+ * reduce task knows where to resume.
+ * @param conf the configuration used to reach the file system
+ * @param shards the shards; entries may be replaced with updated copies
+ */
+ void setShardGeneration(Configuration conf, Shard[] shards)
+ throws IOException {
+ FileSystem fs = FileSystem.get(conf);
+
+ for (int i = 0; i < shards.length; i++) {
+ Path path = new Path(shards[i].getDirectory());
+ // -1 means the shard directory does not exist yet (fresh shard).
+ long generation = -1;
+
+ if (fs.exists(path)) {
+ FileSystemDirectory dir = null;
+
+ try {
+ dir = new FileSystemDirectory(fs, path, false, conf);
+ generation = LuceneUtil.getCurrentSegmentGeneration(dir);
+ } finally {
+ // Always release the directory handle.
+ if (dir != null) {
+ dir.close();
+ }
+ }
+ }
+
+ if (generation != shards[i].getGeneration()) {
+ // set the starting generation for the shard
+ shards[i] =
+ new Shard(shards[i].getVersion(), shards[i].getDirectory(),
+ generation);
+ }
+ }
+ }
+}
\ No newline at end of file
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdater.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IntermediateForm.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IntermediateForm.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IntermediateForm.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IntermediateForm.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,239 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.mapred;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.concurrent.ConcurrentLinkedQueue;
+
+import org.apache.hadoop.contrib.index.lucene.RAMDirectoryUtil;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+
+/**
+ * An intermediate form for one or more parsed Lucene documents and/or
+ * delete terms. It actually uses Lucene file format as the format for
+ * the intermediate form by using RAM dir files.
+ *
+ * Note: If process(*) is ever called, closeWriter() should be called.
+ * Otherwise, no need to call closeWriter().
+ */
+public class IntermediateForm implements Writable {
+
+ // index update configuration; null until configure() is called
+ private IndexUpdateConfiguration iconf = null;
+ // delete terms accumulated so far; a ConcurrentLinkedQueue, presumably so
+ // concurrent process(*) calls can add safely — requirement not visible here
+ private final Collection<Term> deleteList;
+ // RAM-based Lucene directory buffering the added documents
+ private RAMDirectory dir;
+ // lazily created writer over dir; non-null only between the first
+ // process(*) call and closeWriter()
+ private IndexWriter writer;
+ // number of documents contributed via process(*); used only by toString()
+ // and NOT serialized by write()/readFields()
+ private int numDocs;
+
+ /**
+ * Constructor
+ * @throws IOException
+ */
+ public IntermediateForm() throws IOException {
+ deleteList = new ConcurrentLinkedQueue<Term>();
+ dir = new RAMDirectory();
+ writer = null;
+ numDocs = 0;
+ }
+
+ /**
+ * Configure using an index update configuration.
+ * @param iconf the index update configuration
+ */
+ public void configure(IndexUpdateConfiguration iconf) {
+ this.iconf = iconf;
+ }
+
+ /**
+ * Get the ram directory of the intermediate form.
+ * @return the ram directory
+ */
+ public Directory getDirectory() {
+ return dir;
+ }
+
+ /**
+ * Get an iterator for the delete terms in the intermediate form.
+ * @return an iterator for the delete terms
+ */
+ public Iterator<Term> deleteTermIterator() {
+ return deleteList.iterator();
+ }
+
+ /**
+ * This method is used by the index update mapper and process a document
+ * operation into the current intermediate form.
+ * @param doc input document operation
+ * @param analyzer the analyzer
+ * @throws IOException
+ */
+ public void process(DocumentAndOp doc, Analyzer analyzer) throws IOException {
+ // an UPDATE is treated as a DELETE of the old term plus an INSERT,
+ // so it takes both branches below
+ if (doc.getOp() == DocumentAndOp.Op.DELETE
+ || doc.getOp() == DocumentAndOp.Op.UPDATE) {
+ deleteList.add(doc.getTerm());
+
+ }
+
+ if (doc.getOp() == DocumentAndOp.Op.INSERT
+ || doc.getOp() == DocumentAndOp.Op.UPDATE) {
+
+ if (writer == null) {
+ // analyzer is null because we specify an analyzer with addDocument
+ writer = createWriter();
+ }
+
+ writer.addDocument(doc.getDocument(), analyzer);
+ numDocs++;
+ }
+
+ }
+
+ /**
+ * This method is used by the index update combiner and process an
+ * intermediate form into the current intermediate form. More specifically,
+ * the input intermediate forms are a single-document ram index and/or a
+ * single delete term.
+ * @param form the input intermediate form
+ * @throws IOException
+ */
+ public void process(IntermediateForm form) throws IOException {
+ if (form.deleteList.size() > 0) {
+ deleteList.addAll(form.deleteList);
+ }
+
+ if (form.dir.sizeInBytes() > 0) {
+ if (writer == null) {
+ writer = createWriter();
+ }
+
+ writer.addIndexesNoOptimize(new Directory[] { form.dir });
+ // the input form is assumed to hold a single document (see javadoc
+ // above), so count it as one; form.numDocs cannot be used here because
+ // numDocs is not serialized and is 0 after readFields()
+ numDocs++;
+ }
+
+ }
+
+ /**
+ * Close the Lucene index writer associated with the intermediate form,
+ * if created. Do not close the ram directory. In fact, there is no need
+ * to close a ram directory.
+ * @throws IOException
+ */
+ public void closeWriter() throws IOException {
+ if (writer != null) {
+ writer.close();
+ writer = null;
+ }
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#toString()
+ */
+ public String toString() {
+ StringBuilder buffer = new StringBuilder();
+ buffer.append(this.getClass().getSimpleName());
+ buffer.append("[numDocs=");
+ buffer.append(numDocs);
+ buffer.append(", numDeletes=");
+ buffer.append(deleteList.size());
+ if (deleteList.size() > 0) {
+ buffer.append("(");
+ Iterator<Term> iter = deleteTermIterator();
+ while (iter.hasNext()) {
+ buffer.append(iter.next());
+ buffer.append(" ");
+ }
+ buffer.append(")");
+ }
+ buffer.append("]");
+ return buffer.toString();
+ }
+
+ // Create the index writer over dir, with the configured max field length
+ // (if any). NOTE(review): the second constructor argument is false,
+ // presumably disabling autoCommit in this Lucene version — confirm.
+ private IndexWriter createWriter() throws IOException {
+ IndexWriter writer =
+ new IndexWriter(dir, false, null,
+ new KeepOnlyLastCommitDeletionPolicy());
+ // keep index files separate (no .cfs) in the RAM directory
+ writer.setUseCompoundFile(false);
+
+ if (iconf != null) {
+ int maxFieldLength = iconf.getIndexMaxFieldLength();
+ if (maxFieldLength > 0) {
+ writer.setMaxFieldLength(maxFieldLength);
+ }
+ }
+
+ return writer;
+ }
+
+ // Reset to an empty form: clear delete terms, replace a non-empty RAM
+ // directory, and zero the document count. The caller must have closed
+ // the writer already (see the assert below).
+ private void resetForm() throws IOException {
+ deleteList.clear();
+ if (dir.sizeInBytes() > 0) {
+ // it's ok if we don't close a ram directory
+ dir.close();
+ // an alternative is to delete all the files and reuse the ram directory
+ dir = new RAMDirectory();
+ }
+ assert (writer == null);
+ numDocs = 0;
+ }
+
+ // ///////////////////////////////////
+ // Writable
+ // ///////////////////////////////////
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
+ */
+ public void write(DataOutput out) throws IOException {
+ // format: delete-term count, then (field, text) pairs, then the RAM files.
+ // NOTE(review): assumes closeWriter() was called first so that all
+ // buffered documents are flushed into dir — confirm at call sites.
+ // numDocs is intentionally not written.
+ out.writeInt(deleteList.size());
+ for (Term term : deleteList) {
+ Text.writeString(out, term.field());
+ Text.writeString(out, term.text());
+ }
+
+ String[] files = dir.list();
+ RAMDirectoryUtil.writeRAMFiles(out, dir, files);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
+ */
+ public void readFields(DataInput in) throws IOException {
+ // discard any existing state before deserializing (mirrors write())
+ resetForm();
+
+ int numDeleteTerms = in.readInt();
+ for (int i = 0; i < numDeleteTerms; i++) {
+ String field = Text.readString(in);
+ String text = Text.readString(in);
+ deleteList.add(new Term(field, text));
+ }
+
+ RAMDirectoryUtil.readRAMFiles(in, dir);
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IntermediateForm.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/Shard.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/Shard.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/Shard.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/Shard.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,240 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.mapred;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.WritableComparable;
+
+/**
+ * This class represents the metadata of a shard. Version is the version number
+ * of the entire index. Directory is the directory where this shard resides in.
+ * Generation is the Lucene index's generation. Version and generation are
+ * reserved for future use.
+ *
+ * Note: Currently the version number of the entire index is not used and
+ * defaults to -1.
+ */
+public class Shard implements WritableComparable {
+
+ /**
+ * Normalize a directory path: collapse double slashes, convert
+ * backslashes to forward slashes, and drop a trailing slash from a
+ * non-root path. This method is copied from Path.
+ * @param path the path to normalize
+ * @return the normalized path
+ */
+ public static String normalizePath(String path) {
+ // remove double slashes & backslashes
+ path = path.replace("//", "/");
+ path = path.replace("\\", "/");
+
+ // trim trailing slash from non-root path (ignoring windows drive)
+ if (path.length() > 1 && path.endsWith("/")) {
+ path = path.substring(0, path.length() - 1);
+ }
+
+ return path;
+ }
+
+ /**
+ * Store the given shards in an index update configuration as a
+ * comma-separated list of shard strings. An empty array is stored as an
+ * empty string (previously this threw ArrayIndexOutOfBoundsException),
+ * which round-trips through getIndexShards as a zero-length result.
+ * @param conf the index update configuration
+ * @param shards the shards to store
+ */
+ public static void setIndexShards(IndexUpdateConfiguration conf,
+ Shard[] shards) {
+ StringBuilder shardsString = new StringBuilder();
+ for (int i = 0; i < shards.length; i++) {
+ if (i > 0) {
+ shardsString.append(",");
+ }
+ shardsString.append(shards[i].toString());
+ }
+ conf.setIndexShards(shardsString.toString());
+ }
+
+ /**
+ * Read the shards back from an index update configuration.
+ * @param conf the index update configuration
+ * @return the stored shards, or null if none are set
+ */
+ public static Shard[] getIndexShards(IndexUpdateConfiguration conf) {
+ String shards = conf.getIndexShards();
+ if (shards != null) {
+ // the tokenizer skips empty tokens, so an empty string yields
+ // a zero-length shard array
+ ArrayList<Object> list =
+ Collections.list(new StringTokenizer(shards, ","));
+ Shard[] result = new Shard[list.size()];
+ for (int i = 0; i < list.size(); i++) {
+ result[i] = Shard.createShardFromString((String) list.get(i));
+ }
+ return result;
+ } else {
+ return null;
+ }
+ }
+
+ // assume str is formatted correctly as a shard string:
+ // version@directory@generation (the directory must not contain '@',
+ // since the first two '@' occurrences are used as separators)
+ private static Shard createShardFromString(String str) {
+ int first = str.indexOf("@");
+ int second = str.indexOf("@", first + 1);
+ long version = Long.parseLong(str.substring(0, first));
+ String dir = str.substring(first + 1, second);
+ long gen = Long.parseLong(str.substring(second + 1));
+ return new Shard(version, dir, gen);
+ }
+
+ // index/shard version
+ // the shards in the same version of an index have the same version number
+ private long version;
+ private String dir;
+ private long gen; // Lucene's generation
+
+ /**
+ * Constructor. Creates a shard with version -1, no directory and
+ * generation -1 (used before readFields populates the instance).
+ */
+ public Shard() {
+ this.version = -1;
+ this.dir = null;
+ this.gen = -1;
+ }
+
+ /**
+ * Construct a shard from a version number, a directory and a generation
+ * number.
+ * @param version the version number of the entire index
+ * @param dir the directory where this shard resides; normalized on the way in
+ * @param gen the generation of the Lucene instance
+ */
+ public Shard(long version, String dir, long gen) {
+ this.version = version;
+ this.dir = normalizePath(dir);
+ this.gen = gen;
+ }
+
+ /**
+ * Construct using a shard object (copy constructor).
+ * @param shard the shard used by the constructor
+ */
+ public Shard(Shard shard) {
+ this.version = shard.version;
+ this.dir = shard.dir;
+ this.gen = shard.gen;
+ }
+
+ /**
+ * Get the version number of the entire index.
+ * @return the version number of the entire index
+ */
+ public long getVersion() {
+ return version;
+ }
+
+ /**
+ * Get the directory where this shard resides.
+ * @return the directory where this shard resides
+ */
+ public String getDirectory() {
+ return dir;
+ }
+
+ /**
+ * Get the generation of the Lucene instance.
+ * @return the generation of the Lucene instance
+ */
+ public long getGeneration() {
+ return gen;
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#toString()
+ * The format must stay parseable by createShardFromString.
+ */
+ public String toString() {
+ return version + "@" + dir + "@" + gen;
+ }
+
+ // ///////////////////////////////////
+ // Writable
+ // ///////////////////////////////////
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
+ */
+ public void write(DataOutput out) throws IOException {
+ out.writeLong(version);
+ Text.writeString(out, dir);
+ out.writeLong(gen);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
+ */
+ public void readFields(DataInput in) throws IOException {
+ version = in.readLong();
+ dir = Text.readString(in);
+ gen = in.readLong();
+ }
+
+ // ///////////////////////////////////
+ // Comparable
+ // ///////////////////////////////////
+ /* (non-Javadoc)
+ * @see java.lang.Comparable#compareTo(java.lang.Object)
+ */
+ public int compareTo(Object o) {
+ return compareTo((Shard) o);
+ }
+
+ /**
+ * Compare to another shard.
+ * @param other another shard
+ * @return compare version first, then directory and finally generation
+ */
+ public int compareTo(Shard other) {
+ // compare version
+ if (version < other.version) {
+ return -1;
+ } else if (version > other.version) {
+ return 1;
+ }
+ // compare dir
+ int result = dir.compareTo(other.dir);
+ if (result != 0) {
+ return result;
+ }
+ // compare gen
+ if (gen < other.gen) {
+ return -1;
+ } else if (gen == other.gen) {
+ return 0;
+ } else {
+ return 1;
+ }
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#equals(java.lang.Object)
+ */
+ public boolean equals(Object o) {
+ if (this == o) {
+ return true;
+ }
+ if (!(o instanceof Shard)) {
+ return false;
+ }
+ Shard other = (Shard) o;
+ return version == other.version && dir.equals(other.dir)
+ && gen == other.gen;
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#hashCode()
+ * Consistent with equals; the high 32 bits of version/gen are discarded
+ * by the narrowing casts, which only affects hash quality, not correctness.
+ */
+ public int hashCode() {
+ return (int) version ^ dir.hashCode() ^ (int) gen;
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/Shard.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/test/org/apache/hadoop/contrib/index/lucene/TestMixedDirectory.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/test/org/apache/hadoop/contrib/index/lucene/TestMixedDirectory.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/test/org/apache/hadoop/contrib/index/lucene/TestMixedDirectory.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/test/org/apache/hadoop/contrib/index/lucene/TestMixedDirectory.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,105 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.lucene;
+
+import java.io.IOException;
+
+import junit.framework.TestCase;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexDeletionPolicy;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMDirectory;
+
+public class TestMixedDirectory extends TestCase {
+ private int numDocsPerUpdate = 10;
+ private int maxBufferedDocs = 2;
+
+ /**
+ * Build an index in a plain RAM directory, then layer a MixedDirectory
+ * over it (reads from the first directory, writes to a second) and run a
+ * second batch of updates under MixedDeletionPolicy. Both the original
+ * and the mixed view must remain searchable and consistent.
+ */
+ public void testMixedDirectoryAndPolicy() throws IOException {
+ Directory primary = new RAMDirectory();
+ updateIndex(primary, 0, numDocsPerUpdate,
+ new KeepOnlyLastCommitDeletionPolicy());
+
+ verify(primary, numDocsPerUpdate);
+
+ // plant a stray ".cfs" file in the read-side directory before the
+ // second round of updates runs on top of it
+ String strayName =
+ "_" + (numDocsPerUpdate / maxBufferedDocs + 2) + ".cfs";
+ IndexOutput strayOut = primary.createOutput(strayName);
+ strayOut.writeInt(0);
+ strayOut.close();
+
+ Directory secondary = new RAMDirectory();
+ Directory mixed = new MixedDirectory(primary, secondary);
+ updateIndex(mixed, numDocsPerUpdate, numDocsPerUpdate,
+ new MixedDeletionPolicy());
+
+ verify(primary, numDocsPerUpdate);
+ verify(mixed, 2 * numDocsPerUpdate);
+ }
+
+ /**
+ * Add numDocs documents with ids base..base+numDocs-1 to the index in
+ * dir, using the given deletion policy.
+ */
+ public void updateIndex(Directory dir, int base, int numDocs,
+ IndexDeletionPolicy policy) throws IOException {
+ IndexWriter writer =
+ new IndexWriter(dir, false, new StandardAnalyzer(), policy);
+ writer.setMaxBufferedDocs(maxBufferedDocs);
+ writer.setMergeFactor(1000);
+ int docId = base;
+ while (docId < base + numDocs) {
+ addDoc(writer, docId);
+ docId++;
+ }
+ writer.close();
+ }
+
+ // one document: a stored, untokenized "id" plus a tokenized "content"
+ private void addDoc(IndexWriter writer, int id) throws IOException {
+ Document d = new Document();
+ d.add(new Field("id", String.valueOf(id), Field.Store.YES,
+ Field.Index.UN_TOKENIZED));
+ d.add(new Field("content", "apache", Field.Store.NO,
+ Field.Index.TOKENIZED));
+ writer.addDocument(d);
+ }
+
+ // search for content:apache and check that exactly expectedHits docs
+ // match, with each id in [0, expectedHits) appearing exactly once
+ private void verify(Directory dir, int expectedHits) throws IOException {
+ IndexSearcher searcher = new IndexSearcher(dir);
+ Hits matches =
+ searcher.search(new TermQuery(new Term("content", "apache")));
+ int total = matches.length();
+
+ assertEquals(expectedHits, total);
+
+ int[] idCounts = new int[total];
+ for (int i = 0; i < total; i++) {
+ idCounts[Integer.parseInt(matches.doc(i).get("id"))]++;
+ }
+ for (int i = 0; i < total; i++) {
+ assertEquals(1, idCounts[i]);
+ }
+
+ searcher.close();
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/test/org/apache/hadoop/contrib/index/lucene/TestMixedDirectory.java
------------------------------------------------------------------------------
svn:executable = *