Posted to common-commits@hadoop.apache.org by cu...@apache.org on 2008/03/20 04:33:24 UTC
svn commit: r639138 [1/3] - in /hadoop/core/trunk: ./ src/contrib/index/
src/contrib/index/conf/ src/contrib/index/lib/ src/contrib/index/sample/
src/contrib/index/src/ src/contrib/index/src/java/
src/contrib/index/src/java/org/ src/contrib/index/src/j...
Author: cutting
Date: Wed Mar 19 20:33:18 2008
New Revision: 639138
URL: http://svn.apache.org/viewvc?rev=639138&view=rev
Log:
HADOOP-2951. Add a contrib module that provides a utility to build or update Lucene indexes using MapReduce. Contributed by Ning Li.
Added:
hadoop/core/trunk/src/contrib/index/
hadoop/core/trunk/src/contrib/index/README (with props)
hadoop/core/trunk/src/contrib/index/build.xml (with props)
hadoop/core/trunk/src/contrib/index/conf/
hadoop/core/trunk/src/contrib/index/conf/index-config.xml.template (with props)
hadoop/core/trunk/src/contrib/index/lib/
hadoop/core/trunk/src/contrib/index/lib/lucene-core-2.3.1.jar (with props)
hadoop/core/trunk/src/contrib/index/sample/
hadoop/core/trunk/src/contrib/index/sample/data.txt (with props)
hadoop/core/trunk/src/contrib/index/sample/data2.txt (with props)
hadoop/core/trunk/src/contrib/index/src/
hadoop/core/trunk/src/contrib/index/src/java/
hadoop/core/trunk/src/contrib/index/src/java/org/
hadoop/core/trunk/src/contrib/index/src/java/org/apache/
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/HashingDistributionPolicy.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/IdentityLocalAnalysis.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocInputFormat.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocLocalAnalysis.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocRecordReader.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocTextAndOp.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/RoundRobinDistributionPolicy.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/FileSystemDirectory.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneIndexFileNameFilter.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneUtil.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDeletionPolicy.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDirectory.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/RAMDirectoryUtil.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/ShardWriter.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/main/
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/main/UpdateIndex.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentAndOp.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/DocumentID.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IDistributionPolicy.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IIndexUpdater.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/ILocalAnalysis.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateCombiner.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateConfiguration.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateMapper.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateOutputFormat.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdatePartitioner.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdateReducer.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IndexUpdater.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/IntermediateForm.java (with props)
hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/mapred/Shard.java (with props)
hadoop/core/trunk/src/contrib/index/src/test/
hadoop/core/trunk/src/contrib/index/src/test/org/
hadoop/core/trunk/src/contrib/index/src/test/org/apache/
hadoop/core/trunk/src/contrib/index/src/test/org/apache/hadoop/
hadoop/core/trunk/src/contrib/index/src/test/org/apache/hadoop/contrib/
hadoop/core/trunk/src/contrib/index/src/test/org/apache/hadoop/contrib/index/
hadoop/core/trunk/src/contrib/index/src/test/org/apache/hadoop/contrib/index/lucene/
hadoop/core/trunk/src/contrib/index/src/test/org/apache/hadoop/contrib/index/lucene/TestMixedDirectory.java (with props)
hadoop/core/trunk/src/contrib/index/src/test/org/apache/hadoop/contrib/index/mapred/
hadoop/core/trunk/src/contrib/index/src/test/org/apache/hadoop/contrib/index/mapred/TestDistributionPolicy.java (with props)
hadoop/core/trunk/src/contrib/index/src/test/org/apache/hadoop/contrib/index/mapred/TestIndexUpdater.java (with props)
Modified:
hadoop/core/trunk/CHANGES.txt
hadoop/core/trunk/build.xml
Modified: hadoop/core/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/CHANGES.txt?rev=639138&r1=639137&r2=639138&view=diff
==============================================================================
--- hadoop/core/trunk/CHANGES.txt (original)
+++ hadoop/core/trunk/CHANGES.txt Wed Mar 19 20:33:18 2008
@@ -56,6 +56,9 @@
HADOOP-2346. Utilities to support timeout while writing to sockets.
DFSClient and DataNode sockets have 10min write timeout. (rangadi)
+ HADOOP-2951. Add a contrib module that provides a utility to
+ build or update Lucene indexes using Map/Reduce. (Ning Li via cutting)
+
IMPROVEMENTS
HADOOP-2655. Copy on write for data and metadata files in the
Modified: hadoop/core/trunk/build.xml
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/build.xml?rev=639138&r1=639137&r2=639138&view=diff
==============================================================================
--- hadoop/core/trunk/build.xml (original)
+++ hadoop/core/trunk/build.xml Wed Mar 19 20:33:18 2008
@@ -698,6 +698,7 @@
<packageset dir="src/contrib/streaming/src/java"/>
<packageset dir="src/contrib/data_join/src/java"/>
+ <packageset dir="src/contrib/index/src/java"/>
<link href="${javadoc.link.java}"/>
@@ -714,6 +715,7 @@
<group title="contrib: Streaming" packages="org.apache.hadoop.streaming*"/>
<group title="contrib: DataJoin" packages="org.apache.hadoop.contrib.utils.join*"/>
+ <group title="contrib: Index" packages="org.apache.hadoop.contrib.index*"/>
</javadoc>
</target>
Added: hadoop/core/trunk/src/contrib/index/README
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/README?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/README (added)
+++ hadoop/core/trunk/src/contrib/index/README Wed Mar 19 20:33:18 2008
@@ -0,0 +1,43 @@
+This contrib package provides a utility to build or update an index
+using Map/Reduce.
+
+A distributed "index" is partitioned into "shards". Each shard corresponds
+to a Lucene instance. org.apache.hadoop.contrib.index.main.UpdateIndex
+contains the main() method which uses a Map/Reduce job to analyze documents
+and update Lucene instances in parallel.
+
+The Map phase of the Map/Reduce job formats and analyzes the input
+documents (in parallel), while the Reduce phase collects and applies the
+updates to each Lucene instance (again in parallel). The updates are
+applied on the local file system of the node where a Reduce task runs,
+and are then copied back to HDFS.
+For example, if the updates caused a new Lucene segment to be created, the
+new segment would be created on the local file system first, and then
+copied back to HDFS.
+
+When the Map/Reduce job completes, a "new version" of the index is ready
+to be queried. It is important to note that the new version of the index
+is not built from scratch. By leveraging Lucene's update algorithm, the
+new version of each Lucene instance shares as many files as possible
+with the previous version.
+
+The main() method in UpdateIndex requires the following information for
+updating the shards:
+ - Input format. This specifies the format of the input documents, i.e.
+ how they should be parsed.
+ - Analysis. This defines the local analysis to apply to the input. The
+ local analysis determines whether a document is being inserted, updated,
+ or deleted. For an insert or an update, it also converts the input
+ document into a Lucene document.
+ - Input paths. This provides the location(s) of updated documents,
+ e.g., HDFS files or directories, or HBase tables.
+ - Shard paths, or index path with the number of shards. Either specify
+ the path for each shard, or specify an index path and the shards are
+ the sub-directories of the index directory.
+ - Output path. When the update to a shard is done, a message is put here.
+ - Number of map tasks.
+
+All of the information can be specified in a configuration file. All but
+the first two can also be specified as command line options. Check out
+conf/index-config.xml.template for other configurable parameters.
+
+Note: Because of the parallel nature of Map/Reduce, the behaviour of
+multiple inserts, deletes or updates to the same document is undefined.
Propchange: hadoop/core/trunk/src/contrib/index/README
------------------------------------------------------------------------------
svn:executable = *
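
Since the contrib build (below) sets UpdateIndex as the jar's Main-Class, the
utility can be launched with "hadoop jar". A minimal sketch of an invocation,
assuming illustrative option names (the authoritative list is what UpdateIndex
itself prints; only input format and analysis cannot be given on the command
line, per the README above):

    hadoop jar hadoop-<version>-index.jar \
        -inputPaths <inputPath,inputPath> \
        -outputPath <outputPath> \
        -indexPath <indexPath> -numShards <numShards> \
        -numMapTasks <numMapTasks> \
        -conf conf/index-config.xml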
Added: hadoop/core/trunk/src/contrib/index/build.xml
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/build.xml?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/build.xml (added)
+++ hadoop/core/trunk/src/contrib/index/build.xml Wed Mar 19 20:33:18 2008
@@ -0,0 +1,80 @@
+<?xml version="1.0"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<project name="index" default="jar">
+
+ <import file="../build-contrib.xml"/>
+
+ <!-- Override jar target to specify main class -->
+ <target name="jar" depends="compile" unless="skip.contrib">
+ <echo message="contrib: ${name}"/>
+ <jar
+ jarfile="${build.dir}/hadoop-${version}-${name}.jar"
+ basedir="${build.classes}"
+ >
+ <manifest>
+ <attribute name="Main-Class" value="org.apache.hadoop.contrib.index.main.UpdateIndex"/>
+ </manifest>
+ </jar>
+ </target>
+
+ <!-- Override test target to copy sample data -->
+ <target name="test" depends="compile-test, compile, compile-examples" if="test.available">
+ <echo message="contrib: ${name}"/>
+ <delete dir="${hadoop.log.dir}"/>
+ <mkdir dir="${hadoop.log.dir}"/>
+ <delete dir="${build.test}/sample"/>
+ <mkdir dir="${build.test}/sample"/>
+ <copy todir="${build.test}/sample">
+ <fileset dir="${root}/sample"/>
+ </copy>
+ <junit
+ printsummary="yes" showoutput="${test.output}"
+ haltonfailure="no" fork="yes" maxmemory="256m"
+ errorProperty="tests.failed" failureProperty="tests.failed"
+ timeout="${test.timeout}">
+
+ <sysproperty key="test.build.data" value="${build.test}/data"/>
+ <sysproperty key="build.test" value="${build.test}"/>
+ <sysproperty key="contrib.name" value="${name}"/>
+
+ <!-- requires fork=yes for:
+ relative File paths to use the specified user.dir
+ classpath to use build/contrib/*.jar
+ -->
+ <sysproperty key="user.dir" value="${build.test}/data"/>
+
+ <sysproperty key="fs.default.name" value="${fs.default.name}"/>
+ <sysproperty key="hadoop.test.localoutputfile" value="${hadoop.test.localoutputfile}"/>
+ <sysproperty key="hadoop.log.dir" value="${hadoop.log.dir}"/>
+ <classpath refid="test.classpath"/>
+ <formatter type="${test.junit.output.format}" />
+ <batchtest todir="${build.test}" unless="testcase">
+ <fileset dir="${src.test}"
+ includes="**/Test*.java" excludes="**/${test.exclude}.java" />
+ </batchtest>
+ <batchtest todir="${build.test}" if="testcase">
+ <fileset dir="${src.test}" includes="**/${testcase}.java"/>
+ </batchtest>
+ </junit>
+ <fail if="tests.failed">Tests failed!</fail>
+
+ </target>
+
+</project>
Propchange: hadoop/core/trunk/src/contrib/index/build.xml
------------------------------------------------------------------------------
svn:executable = *
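
Given the default target and the ${testcase} switch in this build file, typical
developer invocations from src/contrib/index would be:

    ant jar                                # build hadoop-<version>-index.jar
    ant test                               # run every Test*.java under src/test
    ant test -Dtestcase=TestIndexUpdater   # run a single test case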
Added: hadoop/core/trunk/src/contrib/index/conf/index-config.xml.template
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/conf/index-config.xml.template?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/conf/index-config.xml.template (added)
+++ hadoop/core/trunk/src/contrib/index/conf/index-config.xml.template Wed Mar 19 20:33:18 2008
@@ -0,0 +1,48 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+
+<!-- Put site-specific property overrides in this file. -->
+
+<configuration>
+
+<property>
+ <name>sea.distribution.policy</name>
+ <value>org.apache.hadoop.contrib.index.example.HashingDistributionPolicy</value>
+</property>
+
+<property>
+ <name>sea.document.analyzer</name>
+ <value>org.apache.lucene.analysis.standard.StandardAnalyzer</value>
+</property>
+
+<property>
+ <name>sea.input.format</name>
+ <value>org.apache.hadoop.contrib.index.example.LineDocInputFormat</value>
+</property>
+
+<property>
+ <name>sea.index.updater</name>
+ <value>org.apache.hadoop.contrib.index.mapred.IndexUpdater</value>
+</property>
+
+<property>
+ <name>sea.local.analysis</name>
+ <value>org.apache.hadoop.contrib.index.example.LineDocLocalAnalysis</value>
+</property>
+
+<property>
+ <name>sea.max.field.length</name>
+ <value>2000000</value>
+</property>
+
+<property>
+ <name>sea.max.num.segments</name>
+ <value>10</value>
+</property>
+
+<property>
+ <name>sea.use.compound.file</name>
+ <value>true</value>
+</property>
+
+</configuration>
Propchange: hadoop/core/trunk/src/contrib/index/conf/index-config.xml.template
------------------------------------------------------------------------------
svn:executable = *
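
The sea.* keys above are ordinary Hadoop configuration properties. A minimal
sketch of reading them through org.apache.hadoop.conf.Configuration (the
IndexUpdateConfiguration class added by this commit presumably wraps lookups
of this kind):

    Configuration conf = new Configuration();
    conf.addResource(new Path("index-config.xml"));

    // fall back to the template's values when a key is absent
    String policy = conf.get("sea.distribution.policy",
        "org.apache.hadoop.contrib.index.example.HashingDistributionPolicy");
    int maxFieldLength = conf.getInt("sea.max.field.length", 2000000);
    boolean useCompoundFile = conf.getBoolean("sea.use.compound.file", true);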
Added: hadoop/core/trunk/src/contrib/index/lib/lucene-core-2.3.1.jar
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/lib/lucene-core-2.3.1.jar?rev=639138&view=auto
==============================================================================
Binary file - no diff available.
Propchange: hadoop/core/trunk/src/contrib/index/lib/lucene-core-2.3.1.jar
------------------------------------------------------------------------------
svn:executable = *
Propchange: hadoop/core/trunk/src/contrib/index/lib/lucene-core-2.3.1.jar
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: hadoop/core/trunk/src/contrib/index/sample/data.txt
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/sample/data.txt?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/sample/data.txt (added)
+++ hadoop/core/trunk/src/contrib/index/sample/data.txt Wed Mar 19 20:33:18 2008
@@ -0,0 +1,10 @@
+0 ins apache dot org
+1 ins apache
+2 ins apache
+3 ins apache
+4 ins apache
+5 ins apache
+6 ins apache
+7 ins apache
+8 ins apache
+9 ins apache
Propchange: hadoop/core/trunk/src/contrib/index/sample/data.txt
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/sample/data2.txt
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/sample/data2.txt?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/sample/data2.txt (added)
+++ hadoop/core/trunk/src/contrib/index/sample/data2.txt Wed Mar 19 20:33:18 2008
@@ -0,0 +1,10 @@
+0 del
+1 upd hadoop
+2 del
+3 upd hadoop
+4 del
+5 upd hadoop
+6 del
+7 upd hadoop
+8 del
+9 upd hadoop
Propchange: hadoop/core/trunk/src/contrib/index/sample/data2.txt
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/HashingDistributionPolicy.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/HashingDistributionPolicy.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/HashingDistributionPolicy.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/HashingDistributionPolicy.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.example;
+
+import org.apache.hadoop.contrib.index.mapred.DocumentID;
+import org.apache.hadoop.contrib.index.mapred.IDistributionPolicy;
+import org.apache.hadoop.contrib.index.mapred.Shard;
+
+/**
+ * Choose a shard for each insert or delete based on document id hashing. Do
+ * NOT use this distribution policy when the number of shards changes.
+ */
+public class HashingDistributionPolicy implements IDistributionPolicy {
+
+ private int numShards;
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#init(org.apache.hadoop.contrib.index.mapred.Shard[])
+ */
+ public void init(Shard[] shards) {
+ numShards = shards.length;
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForInsert(org.apache.hadoop.contrib.index.mapred.DocumentID)
+ */
+ public int chooseShardForInsert(DocumentID key) {
+ int hashCode = key.hashCode();
+ return hashCode >= 0 ? hashCode % numShards : (-hashCode) % numShards;
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForDelete(org.apache.hadoop.contrib.index.mapred.DocumentID)
+ */
+ public int chooseShardForDelete(DocumentID key) {
+ int hashCode = key.hashCode();
+ return hashCode >= 0 ? hashCode % numShards : (-hashCode) % numShards;
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/HashingDistributionPolicy.java
------------------------------------------------------------------------------
svn:executable = *
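
The shard choice above is a sign-adjusted modulo of DocumentID.hashCode(). A
standalone sketch of the same arithmetic, including one edge case the code
above does not handle:

    int numShards = 3;
    int hashCode = "doc-42".hashCode();
    // same expression as chooseShardForInsert/chooseShardForDelete
    int shard = hashCode >= 0 ? hashCode % numShards : (-hashCode) % numShards;
    // caveat: for hashCode == Integer.MIN_VALUE, -hashCode overflows and stays
    // negative; (hashCode & Integer.MAX_VALUE) % numShards avoids that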
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/IdentityLocalAnalysis.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/IdentityLocalAnalysis.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/IdentityLocalAnalysis.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/IdentityLocalAnalysis.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,57 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.example;
+
+import java.io.IOException;
+
+import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
+import org.apache.hadoop.contrib.index.mapred.DocumentID;
+import org.apache.hadoop.contrib.index.mapred.ILocalAnalysis;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * Identity local analysis maps inputs directly into outputs.
+ */
+public class IdentityLocalAnalysis implements
+ ILocalAnalysis<DocumentID, DocumentAndOp> {
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
+ */
+ public void map(DocumentID key, DocumentAndOp value,
+ OutputCollector<DocumentID, DocumentAndOp> output, Reporter reporter)
+ throws IOException {
+ output.collect(key, value);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf)
+ */
+ public void configure(JobConf job) {
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.io.Closeable#close()
+ */
+ public void close() throws IOException {
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/IdentityLocalAnalysis.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocInputFormat.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocInputFormat.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocInputFormat.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocInputFormat.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.example;
+
+import java.io.IOException;
+
+import org.apache.hadoop.contrib.index.mapred.DocumentID;
+import org.apache.hadoop.mapred.FileInputFormat;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.InputSplit;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.RecordReader;
+import org.apache.hadoop.mapred.Reporter;
+
+/**
+ * An InputFormat for LineDoc: plain text files in which each line is a doc.
+ */
+public class LineDocInputFormat extends
+ FileInputFormat<DocumentID, LineDocTextAndOp> {
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.FileInputFormat#getRecordReader(org.apache.hadoop.mapred.InputSplit, org.apache.hadoop.mapred.JobConf, org.apache.hadoop.mapred.Reporter)
+ */
+ public RecordReader<DocumentID, LineDocTextAndOp> getRecordReader(
+ InputSplit split, JobConf job, Reporter reporter) throws IOException {
+ reporter.setStatus(split.toString());
+ return new LineDocRecordReader(job, (FileSplit) split);
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocInputFormat.java
------------------------------------------------------------------------------
svn:executable = *
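
A sketch of wiring this input format into a job with the 0.16-era JobConf API
(the input path is illustrative):

    JobConf job = new JobConf();
    // keys are DocumentID, values are LineDocTextAndOp
    job.setInputFormat(LineDocInputFormat.class);
    job.setInputPath(new Path("sample/data.txt"));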
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocLocalAnalysis.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocLocalAnalysis.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocLocalAnalysis.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocLocalAnalysis.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.example;
+
+import java.io.IOException;
+
+import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
+import org.apache.hadoop.contrib.index.mapred.DocumentID;
+import org.apache.hadoop.contrib.index.mapred.ILocalAnalysis;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.mapred.Reporter;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.Term;
+
+/**
+ * Convert LineDocTextAndOp to DocumentAndOp as required by ILocalAnalysis.
+ */
+public class LineDocLocalAnalysis implements
+ ILocalAnalysis<DocumentID, LineDocTextAndOp> {
+
+ private static String docidFieldName = "id";
+ private static String contentFieldName = "content";
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object, org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
+ */
+ public void map(DocumentID key, LineDocTextAndOp value,
+ OutputCollector<DocumentID, DocumentAndOp> output, Reporter reporter)
+ throws IOException {
+
+ DocumentAndOp.Op op = value.getOp();
+ Document doc = null;
+ Term term = null;
+
+ if (op == DocumentAndOp.Op.INSERT || op == DocumentAndOp.Op.UPDATE) {
+ doc = new Document();
+ doc.add(new Field(docidFieldName, key.getText().toString(),
+ Field.Store.YES, Field.Index.UN_TOKENIZED));
+ doc.add(new Field(contentFieldName, value.getText().toString(),
+ Field.Store.NO, Field.Index.TOKENIZED));
+ }
+
+ if (op == DocumentAndOp.Op.DELETE || op == DocumentAndOp.Op.UPDATE) {
+ term = new Term(docidFieldName, key.getText().toString());
+ }
+
+ output.collect(key, new DocumentAndOp(op, doc, term));
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.JobConfigurable#configure(org.apache.hadoop.mapred.JobConf)
+ */
+ public void configure(JobConf job) {
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.io.Closeable#close()
+ */
+ public void close() throws IOException {
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocLocalAnalysis.java
------------------------------------------------------------------------------
svn:executable = *
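
For the update line "3 upd hadoop" in data2.txt, the map() above emits a
DocumentAndOp carrying both halves of an update: a new Lucene Document whose
"id" field is stored and untokenized and whose "content" field is tokenized
but not stored, plus a Term("id", "3") identifying the old version to delete.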
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocRecordReader.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocRecordReader.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocRecordReader.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocRecordReader.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,231 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.example;
+
+import java.io.BufferedInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
+import org.apache.hadoop.contrib.index.mapred.DocumentID;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapred.FileSplit;
+import org.apache.hadoop.mapred.RecordReader;
+
+/**
+ * A simple RecordReader for LineDoc: plain text files in which each line is
+ * a doc. Each line is as follows: documentID<SPACE>op<SPACE>content<EOL>,
+ * where op can be "i", "ins" or "insert" for insert, "d", "del" or "delete"
+ * for delete, or "u", "upd" or "update" for update.
+ */
+public class LineDocRecordReader implements
+ RecordReader<DocumentID, LineDocTextAndOp> {
+ private static final char SPACE = ' ';
+ private static final char EOL = '\n';
+
+ private long start;
+ private long pos;
+ private long end;
+ private BufferedInputStream in;
+ private ByteArrayOutputStream buffer = new ByteArrayOutputStream(256);
+
+ /**
+ * Provide a bridge to get the bytes from the ByteArrayOutputStream without
+ * creating a new byte array.
+ */
+ private static class TextStuffer extends OutputStream {
+ public Text target;
+
+ public void write(int b) {
+ throw new UnsupportedOperationException("write(byte) not supported");
+ }
+
+ public void write(byte[] data, int offset, int len) throws IOException {
+ target.set(data, offset, len);
+ }
+ }
+
+ private TextStuffer bridge = new TextStuffer();
+
+ /**
+ * Constructor
+ * @param job
+ * @param split
+ * @throws IOException
+ */
+ public LineDocRecordReader(Configuration job, FileSplit split)
+ throws IOException {
+ long start = split.getStart();
+ long end = start + split.getLength();
+ final Path file = split.getPath();
+
+ // open the file and seek to the start of the split
+ FileSystem fs = file.getFileSystem(job);
+ FSDataInputStream fileIn = fs.open(split.getPath());
+ InputStream in = fileIn;
+ boolean skipFirstLine = false;
+ if (start != 0) {
+ skipFirstLine = true; // wait till BufferedInputStream to skip
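+ // back up one byte so that, if the split begins right after an EOL, the
+ // readData call below consumes only that EOL and no line is lost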
+ --start;
+ fileIn.seek(start);
+ }
+
+ this.in = new BufferedInputStream(in);
+ if (skipFirstLine) { // skip first line and re-establish "start".
+ start += LineDocRecordReader.readData(this.in, null, EOL);
+ }
+ this.start = start;
+ this.pos = start;
+ this.end = end;
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.RecordReader#close()
+ */
+ public void close() throws IOException {
+ in.close();
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.RecordReader#createKey()
+ */
+ public DocumentID createKey() {
+ return new DocumentID();
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.RecordReader#createValue()
+ */
+ public LineDocTextAndOp createValue() {
+ return new LineDocTextAndOp();
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.RecordReader#getPos()
+ */
+ public long getPos() throws IOException {
+ return pos;
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.RecordReader#getProgress()
+ */
+ public float getProgress() throws IOException {
+ if (start == end) {
+ return 0.0f;
+ } else {
+ return Math.min(1.0f, (pos - start) / (float) (end - start));
+ }
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.mapred.RecordReader#next(java.lang.Object, java.lang.Object)
+ */
+ public synchronized boolean next(DocumentID key, LineDocTextAndOp value)
+ throws IOException {
+ if (pos >= end) {
+ return false;
+ }
+
+ // key is the document id: the bytes up to the first space
+ if (!readInto(key.getText(), SPACE)) {
+ return false;
+ }
+
+ // read operation: i/d/u, or ins/del/upd, or insert/delete/update
+ Text opText = new Text();
+ if (!readInto(opText, SPACE)) {
+ return false;
+ }
+ String opStr = opText.toString();
+ DocumentAndOp.Op op;
+ if (opStr.equals("i") || opStr.equals("ins") || opStr.equals("insert")) {
+ op = DocumentAndOp.Op.INSERT;
+ } else if (opStr.equals("d") || opStr.equals("del")
+ || opStr.equals("delete")) {
+ op = DocumentAndOp.Op.DELETE;
+ } else if (opStr.equals("u") || opStr.equals("upd")
+ || opStr.equals("update")) {
+ op = DocumentAndOp.Op.UPDATE;
+ } else {
+ // default is insert
+ op = DocumentAndOp.Op.INSERT;
+ }
+ value.setOp(op);
+
+ if (op == DocumentAndOp.Op.DELETE) {
+ return true;
+ } else {
+ // read rest of the line
+ return readInto(value.getText(), EOL);
+ }
+ }
+
+ private boolean readInto(Text text, char delimiter) throws IOException {
+ buffer.reset();
+ long bytesRead = readData(in, buffer, delimiter);
+ if (bytesRead == 0) {
+ return false;
+ }
+ pos += bytesRead;
+ bridge.target = text;
+ buffer.writeTo(bridge);
+ return true;
+ }
+
+ private static long readData(InputStream in, OutputStream out, char delimiter)
+ throws IOException {
+ long bytes = 0;
+ while (true) {
+
+ int b = in.read();
+ if (b == -1) {
+ break;
+ }
+ bytes += 1;
+
+ byte c = (byte) b;
+ if (c == EOL || c == delimiter) {
+ break;
+ }
+
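+ // CR handling: peek at the next byte; a CRLF pair is consumed entirely,
+ // while after a bare CR the peeked byte is pushed back for the next read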
+ if (c == '\r') {
+ in.mark(1);
+ byte nextC = (byte) in.read();
+ if (nextC != EOL || c == delimiter) {
+ in.reset();
+ } else {
+ bytes += 1;
+ }
+ break;
+ }
+
+ if (out != null) {
+ out.write(c);
+ }
+ }
+ return bytes;
+ }
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocRecordReader.java
------------------------------------------------------------------------------
svn:executable = *
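
Applied to lines in the sample-data format, next() fills the key with the
bytes before the first space and dispatches on the op token:

    0 ins apache dot org   -> key "0", op INSERT, text "apache dot org"
    3 upd hadoop           -> key "3", op UPDATE, text "hadoop"
    4 del                  -> key "4", op DELETE, no content read
    5 foo bar              -> unrecognized op, defaults to INSERT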
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocTextAndOp.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocTextAndOp.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocTextAndOp.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocTextAndOp.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,92 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.example;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.contrib.index.mapred.DocumentAndOp;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * This class represents an operation. The operation can be an insert, a delete
+ * or an update. If the operation is an insert or an update, a (new) document,
+ * which is in the form of text, is specified.
+ */
+public class LineDocTextAndOp implements Writable {
+ private DocumentAndOp.Op op;
+ private Text doc;
+
+ /**
+ * Constructor
+ */
+ public LineDocTextAndOp() {
+ doc = new Text();
+ }
+
+ /**
+ * Set the type of the operation.
+ * @param op the type of the operation
+ */
+ public void setOp(DocumentAndOp.Op op) {
+ this.op = op;
+ }
+
+ /**
+ * Get the type of the operation.
+ * @return the type of the operation
+ */
+ public DocumentAndOp.Op getOp() {
+ return op;
+ }
+
+ /**
+ * Get the text that represents a document.
+ * @return the text that represents a document
+ */
+ public Text getText() {
+ return doc;
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#toString()
+ */
+ public String toString() {
+ return this.getClass().getName() + "[op=" + op + ", text=" + doc + "]";
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
+ */
+ public void write(DataOutput out) throws IOException {
+ throw new IOException(this.getClass().getName()
+ + ".write should never be called");
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
+ */
+ public void readFields(DataInput in) throws IOException {
+ throw new IOException(this.getClass().getName()
+ + ".readFields should never be called");
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/LineDocTextAndOp.java
------------------------------------------------------------------------------
svn:executable = *
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/RoundRobinDistributionPolicy.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/RoundRobinDistributionPolicy.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/RoundRobinDistributionPolicy.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/RoundRobinDistributionPolicy.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.example;
+
+import org.apache.hadoop.contrib.index.mapred.DocumentID;
+import org.apache.hadoop.contrib.index.mapred.IDistributionPolicy;
+import org.apache.hadoop.contrib.index.mapred.Shard;
+
+/**
+ * Choose a shard for each insert in a round-robin fashion. Choose all the
+ * shards for each delete because we don't know where the document is stored.
+ */
+public class RoundRobinDistributionPolicy implements IDistributionPolicy {
+
+ private int numShards;
+ private int rr; // round-robin implementation
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#init(org.apache.hadoop.contrib.index.mapred.Shard[])
+ */
+ public void init(Shard[] shards) {
+ numShards = shards.length;
+ rr = 0;
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForInsert(org.apache.hadoop.contrib.index.mapred.DocumentID)
+ */
+ public int chooseShardForInsert(DocumentID key) {
+ int chosen = rr;
+ rr = (rr + 1) % numShards;
+ return chosen;
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.contrib.index.mapred.IDistributionPolicy#chooseShardForDelete(org.apache.hadoop.contrib.index.mapred.DocumentID)
+ */
+ public int chooseShardForDelete(DocumentID key) {
+ // -1 represents all the shards
+ return -1;
+ }
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/example/RoundRobinDistributionPolicy.java
------------------------------------------------------------------------------
svn:executable = *
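
A short sketch of the policy's contract (init() only inspects the array
length, so placeholder shards suffice for illustration):

    IDistributionPolicy policy = new RoundRobinDistributionPolicy();
    policy.init(new Shard[3]);           // only shards.length is used
    DocumentID id = new DocumentID();
    policy.chooseShardForInsert(id);     // 0, then 1, then 2, then 0, ...
    policy.chooseShardForDelete(id);     // always -1, i.e. every shard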
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/FileSystemDirectory.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/FileSystemDirectory.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/FileSystemDirectory.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/FileSystemDirectory.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,349 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.lucene;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.lucene.store.BufferedIndexInput;
+import org.apache.lucene.store.BufferedIndexOutput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.Lock;
+
+/**
+ * This class implements a Lucene Directory on top of a general FileSystem.
+ * Currently it does not support locking.
+ */
+public class FileSystemDirectory extends Directory {
+
+ private final FileSystem fs;
+ private final Path directory;
+ private final int ioFileBufferSize;
+
+ /**
+ * Constructor
+ * @param fs
+ * @param directory
+ * @param create
+ * @param conf
+ * @throws IOException
+ */
+ public FileSystemDirectory(FileSystem fs, Path directory, boolean create,
+ Configuration conf) throws IOException {
+
+ this.fs = fs;
+ this.directory = directory;
+ this.ioFileBufferSize = conf.getInt("io.file.buffer.size", 4096);
+
+ if (create) {
+ create();
+ }
+
+ boolean isDir = false;
+ try {
+ FileStatus status = fs.getFileStatus(directory);
+ if (status != null) {
+ isDir = status.isDir();
+ }
+ } catch (IOException e) {
+ // file does not exist, isDir already set to false
+ }
+ if (!isDir) {
+ throw new IOException(directory + " is not a directory");
+ }
+ }
+
+ private void create() throws IOException {
+ if (!fs.exists(directory)) {
+ fs.mkdirs(directory);
+ }
+
+ boolean isDir = false;
+ try {
+ FileStatus status = fs.getFileStatus(directory);
+ if (status != null) {
+ isDir = status.isDir();
+ }
+ } catch (IOException e) {
+ // file does not exist, isDir already set to false
+ }
+ if (!isDir) {
+ throw new IOException(directory + " is not a directory");
+ }
+
+ // clear old index files
+ FileStatus[] fileStatus =
+ fs.listStatus(directory, LuceneIndexFileNameFilter.getFilter());
+ for (int i = 0; i < fileStatus.length; i++) {
+ if (!fs.delete(fileStatus[i].getPath())) {
+ throw new IOException("Cannot delete index file "
+ + fileStatus[i].getPath());
+ }
+ }
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.store.Directory#list()
+ */
+ public String[] list() throws IOException {
+ FileStatus[] fileStatus =
+ fs.listStatus(directory, LuceneIndexFileNameFilter.getFilter());
+ String[] result = new String[fileStatus.length];
+ for (int i = 0; i < fileStatus.length; i++) {
+ result[i] = fileStatus[i].getPath().getName();
+ }
+ return result;
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.store.Directory#fileExists(java.lang.String)
+ */
+ public boolean fileExists(String name) throws IOException {
+ return fs.exists(new Path(directory, name));
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.store.Directory#fileModified(java.lang.String)
+ */
+ public long fileModified(String name) {
+ throw new UnsupportedOperationException();
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.store.Directory#touchFile(java.lang.String)
+ */
+ public void touchFile(String name) {
+ throw new UnsupportedOperationException();
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.store.Directory#fileLength(java.lang.String)
+ */
+ public long fileLength(String name) throws IOException {
+ return fs.getFileStatus(new Path(directory, name)).getLen();
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.store.Directory#deleteFile(java.lang.String)
+ */
+ public void deleteFile(String name) throws IOException {
+ if (!fs.delete(new Path(directory, name))) {
+ throw new IOException("Cannot delete index file " + name);
+ }
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.store.Directory#renameFile(java.lang.String, java.lang.String)
+ */
+ public void renameFile(String from, String to) throws IOException {
+ fs.rename(new Path(directory, from), new Path(directory, to));
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.store.Directory#createOutput(java.lang.String)
+ */
+ public IndexOutput createOutput(String name) throws IOException {
+ Path file = new Path(directory, name);
+ if (fs.exists(file) && !fs.delete(file)) {
+ // delete the existing one if applicable
+ throw new IOException("Cannot overwrite index file " + file);
+ }
+
+ return new FileSystemIndexOutput(file, ioFileBufferSize);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.store.Directory#openInput(java.lang.String)
+ */
+ public IndexInput openInput(String name) throws IOException {
+ return openInput(name, ioFileBufferSize);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.store.Directory#openInput(java.lang.String, int)
+ */
+ public IndexInput openInput(String name, int bufferSize) throws IOException {
+ return new FileSystemIndexInput(new Path(directory, name), bufferSize);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.store.Directory#makeLock(java.lang.String)
+ */
+ public Lock makeLock(final String name) {
+ return new Lock() {
+ public boolean obtain() {
+ return true;
+ }
+
+ public void release() {
+ }
+
+ public boolean isLocked() {
+ throw new UnsupportedOperationException();
+ }
+
+ public String toString() {
+ return "Lock@" + new Path(directory, name);
+ }
+ };
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.store.Directory#close()
+ */
+ public void close() throws IOException {
+ // do not close the file system
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#toString()
+ */
+ public String toString() {
+ return this.getClass().getName() + "@" + directory;
+ }
+
+ private class FileSystemIndexInput extends BufferedIndexInput {
+
+ // shared by clones
+ private class Descriptor {
+ public final FSDataInputStream in;
+ public long position; // cache of in.getPos()
+
+ public Descriptor(Path file, int ioFileBufferSize) throws IOException {
+ this.in = fs.open(file, ioFileBufferSize);
+ }
+ }
+
+ private final Path filePath; // for debugging
+ private final Descriptor descriptor;
+ private final long length;
+ private boolean isOpen;
+ private boolean isClone;
+
+ public FileSystemIndexInput(Path path, int ioFileBufferSize)
+ throws IOException {
+ filePath = path;
+ descriptor = new Descriptor(path, ioFileBufferSize);
+ length = fs.getFileStatus(path).getLen();
+ isOpen = true;
+ }
+
+ protected void readInternal(byte[] b, int offset, int len)
+ throws IOException {
+ synchronized (descriptor) {
+ long position = getFilePointer();
+ if (position != descriptor.position) {
+ descriptor.in.seek(position);
+ descriptor.position = position;
+ }
+ int total = 0;
+ do {
+ int i = descriptor.in.read(b, offset + total, len - total);
+ if (i == -1) {
+ throw new IOException("Read past EOF");
+ }
+ descriptor.position += i;
+ total += i;
+ } while (total < len);
+ }
+ }
+
+ public void close() throws IOException {
+ if (!isClone) {
+ if (isOpen) {
+ descriptor.in.close();
+ isOpen = false;
+ } else {
+ throw new IOException("Index file " + filePath + " already closed");
+ }
+ }
+ }
+
+ protected void seekInternal(long position) {
+ // handled in readInternal()
+ }
+
+ public long length() {
+ return length;
+ }
+
+ protected void finalize() throws IOException {
+ if (!isClone && isOpen) {
+ close(); // close the file
+ }
+ }
+
+ public Object clone() {
+ FileSystemIndexInput clone = (FileSystemIndexInput) super.clone();
+ clone.isClone = true;
+ return clone;
+ }
+ }
+
+ private class FileSystemIndexOutput extends BufferedIndexOutput {
+
+ private final Path filePath; // for debugging
+ private final FSDataOutputStream out;
+ private boolean isOpen;
+
+ public FileSystemIndexOutput(Path path, int ioFileBufferSize)
+ throws IOException {
+ filePath = path;
+ // overwrite is true by default
+ out = fs.create(path, true, ioFileBufferSize);
+ isOpen = true;
+ }
+
+ public void flushBuffer(byte[] b, int offset, int size) throws IOException {
+ out.write(b, offset, size);
+ }
+
+ public void close() throws IOException {
+ if (isOpen) {
+ super.close();
+ out.close();
+ isOpen = false;
+ } else {
+ throw new IOException("Index file " + filePath + " already closed");
+ }
+ }
+
+ public void seek(long pos) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ public long length() throws IOException {
+ return out.getPos();
+ }
+
+ protected void finalize() throws IOException {
+ if (isOpen) {
+ close(); // close the file
+ }
+ }
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/FileSystemDirectory.java
------------------------------------------------------------------------------
svn:executable = *
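
A minimal sketch of opening an existing shard on HDFS for search through this
class (the path is illustrative; the constructor throws if the path is not a
directory):

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    // create=false: expect an existing index directory
    Directory dir =
        new FileSystemDirectory(fs, new Path("/index/shard0"), false, conf);
    IndexReader reader = IndexReader.open(dir);   // Lucene 2.3 API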
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneIndexFileNameFilter.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneIndexFileNameFilter.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneIndexFileNameFilter.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneIndexFileNameFilter.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.lucene;
+
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.lucene.index.IndexFileNameFilter;
+
+/**
+ * A wrapper class to convert an IndexFileNameFilter which implements
+ * java.io.FilenameFilter to an org.apache.hadoop.fs.PathFilter.
+ */
+class LuceneIndexFileNameFilter implements PathFilter {
+
+ private static final LuceneIndexFileNameFilter singleton =
+ new LuceneIndexFileNameFilter();
+
+ /**
+ * Get a static instance.
+ * @return the static instance
+ */
+ public static LuceneIndexFileNameFilter getFilter() {
+ return singleton;
+ }
+
+ private final IndexFileNameFilter luceneFilter;
+
+ private LuceneIndexFileNameFilter() {
+ luceneFilter = IndexFileNameFilter.getFilter();
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.hadoop.fs.PathFilter#accept(org.apache.hadoop.fs.Path)
+ */
+ public boolean accept(Path path) {
+ return luceneFilter.accept(null, path.getName());
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneIndexFileNameFilter.java
------------------------------------------------------------------------------
svn:executable = *
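
A usage sketch for the filter (ShardWriter later in this commit does exactly
this); note the class is package-private, so the caller must live in
org.apache.hadoop.contrib.index.lucene. localFs and temp are an assumed local
FileSystem and Path.

    // list only the Lucene index files under temp, skipping everything else
    FileStatus[] indexFiles =
        localFs.listStatus(temp, LuceneIndexFileNameFilter.getFilter());
    for (int i = 0; i < indexFiles.length; i++) {
      System.out.println(indexFiles[i].getPath().getName());
    }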
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneUtil.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneUtil.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneUtil.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneUtil.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,112 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.lucene;
+
+import java.io.IOException;
+
+import org.apache.lucene.store.Directory;
+
+/**
+ * This class copies some methods from Lucene's SegmentInfos since that class
+ * is not public.
+ */
+public final class LuceneUtil {
+
+ static final class IndexFileNames {
+ /** Name of the index segment file */
+ static final String SEGMENTS = "segments";
+
+ /** Name of the generation reference file */
+ static final String SEGMENTS_GEN = "segments.gen";
+ }
+
+ /**
+ * Check if a file is a segments_N file.
+ * @param name the file name to check
+ * @return true if the file is a segments_N file
+ */
+ public static boolean isSegmentsFile(String name) {
+ return name.startsWith(IndexFileNames.SEGMENTS)
+ && !name.equals(IndexFileNames.SEGMENTS_GEN);
+ }
+
+ /**
+ * Check if a file is the segments.gen file.
+ * @param name the file name to check
+ * @return true if the file is the segments.gen file
+ */
+ public static boolean isSegmentsGenFile(String name) {
+ return name.equals(IndexFileNames.SEGMENTS_GEN);
+ }
+
+ /**
+ * Get the generation (N) of the current segments_N file in the directory.
+ *
+ * @param directory -- directory to search for the latest segments_N file
+ */
+ public static long getCurrentSegmentGeneration(Directory directory)
+ throws IOException {
+ String[] files = directory.list();
+ if (files == null)
+ throw new IOException("cannot read directory " + directory
+ + ": list() returned null");
+ return getCurrentSegmentGeneration(files);
+ }
+
+ /**
+ * Get the generation (N) of the current segments_N file from a list of
+ * files.
+ *
+ * @param files -- array of file names to check
+ */
+ public static long getCurrentSegmentGeneration(String[] files) {
+ if (files == null) {
+ return -1;
+ }
+ long max = -1;
+ for (int i = 0; i < files.length; i++) {
+ String file = files[i];
+ if (file.startsWith(IndexFileNames.SEGMENTS)
+ && !file.equals(IndexFileNames.SEGMENTS_GEN)) {
+ long gen = generationFromSegmentsFileName(file);
+ if (gen > max) {
+ max = gen;
+ }
+ }
+ }
+ return max;
+ }
+
+ /**
+ * Parse the generation off the segments file name and return it.
+ */
+ public static long generationFromSegmentsFileName(String fileName) {
+ if (fileName.equals(IndexFileNames.SEGMENTS)) {
+ return 0;
+ } else if (fileName.startsWith(IndexFileNames.SEGMENTS)) {
+ return Long.parseLong(
+ fileName.substring(1 + IndexFileNames.SEGMENTS.length()),
+ Character.MAX_RADIX);
+ } else {
+ throw new IllegalArgumentException("fileName \"" + fileName
+ + "\" is not a segments file");
+ }
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/LuceneUtil.java
------------------------------------------------------------------------------
svn:executable = *
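
A worked example of the generation arithmetic above, using the radix-36 naming
Lucene gives segments_N files:

    String[] files =
        { "segments", "segments_2", "segments_a", "segments.gen", "_0.cfs" };
    long gen = LuceneUtil.getCurrentSegmentGeneration(files);
    // gen == 10: "segments_a" parses as 10 in radix 36 (Character.MAX_RADIX),
    // plain "segments" counts as generation 0, and "segments.gen" is skipped.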
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDeletionPolicy.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDeletionPolicy.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDeletionPolicy.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDeletionPolicy.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,49 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.lucene;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.index.IndexCommitPoint;
+import org.apache.lucene.index.IndexDeletionPolicy;
+
+/**
+ * A deletion policy for a mixed directory: act like KeepAllDeletionPolicy for
+ * the read-only directory (keep all commits present at init) and like
+ * KeepOnlyLastCommitDeletionPolicy for the writable directory (initially
+ * empty, keep only the latest commit after init).
+ */
+class MixedDeletionPolicy implements IndexDeletionPolicy {
+
+ private int keepAllFromInit = 0;
+
+ public void onInit(List commits) throws IOException {
+ keepAllFromInit = commits.size();
+ }
+
+ public void onCommit(List commits) throws IOException {
+ int size = commits.size();
+ assert (size > keepAllFromInit);
+ // keep all from init and the latest, delete the rest
+ for (int i = keepAllFromInit; i < size - 1; i++) {
+ ((IndexCommitPoint) commits.get(i)).delete();
+ }
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDeletionPolicy.java
------------------------------------------------------------------------------
svn:executable = *
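
A sketch of how the policy is chosen, mirroring the ShardWriter constructor
later in this commit; dir (a Lucene Directory) and initGeneration are assumed.

    IndexDeletionPolicy policy = (initGeneration < 0)
        ? new KeepOnlyLastCommitDeletionPolicy() // new shard: keep only the latest
        : new MixedDeletionPolicy();             // existing shard: also keep init commits
    IndexWriter writer = new IndexWriter(dir, false, null, policy);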
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDirectory.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDirectory.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDirectory.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDirectory.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,185 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.lucene;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.NoLockFactory;
+
+/**
+ * The initial version of an index is stored in a read-only FileSystem dir
+ * (FileSystemDirectory). Index files created by newer versions are written to
+ * a writable local FS dir (Lucene's FSDirectory). Ideally the general
+ * FileSystemDirectory would be used for the writable dir as well, but we have
+ * to use Lucene's FSDirectory because Lucene currently performs random writes
+ * and FileSystemDirectory only supports sequential writes.
+ *
+ * Note: We may delete files from the read-only FileSystem dir because there
+ * can be some segment files from an uncommitted checkpoint. For the same
+ * reason, we may create files in the writable dir which already exist in the
+ * read-only dir and logically they overwrite the ones in the read-only dir.
+ */
+class MixedDirectory extends Directory {
+
+ private final Directory readDir; // FileSystemDirectory
+ private final Directory writeDir; // Lucene's FSDirectory
+
+ // the methods below check writeDir first to take advantage of the fact
+ // that Lucene's FSDirectory.fileExists is faster
+
+ public MixedDirectory(FileSystem readFs, Path readPath, FileSystem writeFs,
+ Path writePath, Configuration conf) throws IOException {
+
+ try {
+ readDir = new FileSystemDirectory(readFs, readPath, false, conf);
+ // TODO: verify that writeFs is a local file system
+ writeDir = FSDirectory.getDirectory(writePath.toString());
+
+ } catch (IOException e) {
+ try {
+ close();
+ } catch (IOException e1) {
+ // ignore this one, throw the original one
+ }
+ throw e;
+ }
+
+ lockFactory = new NoLockFactory();
+ }
+
+ // for debugging
+ MixedDirectory(Directory readDir, Directory writeDir) throws IOException {
+ this.readDir = readDir;
+ this.writeDir = writeDir;
+
+ lockFactory = new NoLockFactory();
+ }
+
+ @Override
+ public String[] list() throws IOException {
+ String[] readFiles = readDir.list();
+ String[] writeFiles = writeDir.list();
+
+ if (readFiles == null || readFiles.length == 0) {
+ return writeFiles;
+ } else if (writeFiles == null || writeFiles.length == 0) {
+ return readFiles;
+ } else {
+ String[] result = new String[readFiles.length + writeFiles.length];
+ System.arraycopy(readFiles, 0, result, 0, readFiles.length);
+ System.arraycopy(writeFiles, 0, result, readFiles.length,
+ writeFiles.length);
+ return result;
+ }
+ }
+
+ @Override
+ public void deleteFile(String name) throws IOException {
+ if (writeDir.fileExists(name)) {
+ writeDir.deleteFile(name);
+ }
+ if (readDir.fileExists(name)) {
+ readDir.deleteFile(name);
+ }
+ }
+
+ @Override
+ public boolean fileExists(String name) throws IOException {
+ return writeDir.fileExists(name) || readDir.fileExists(name);
+ }
+
+ @Override
+ public long fileLength(String name) throws IOException {
+ if (writeDir.fileExists(name)) {
+ return writeDir.fileLength(name);
+ } else {
+ return readDir.fileLength(name);
+ }
+ }
+
+ @Override
+ public long fileModified(String name) throws IOException {
+ if (writeDir.fileExists(name)) {
+ return writeDir.fileModified(name);
+ } else {
+ return readDir.fileModified(name);
+ }
+ }
+
+ @Override
+ public void renameFile(String from, String to) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void touchFile(String name) throws IOException {
+ if (writeDir.fileExists(name)) {
+ writeDir.touchFile(name);
+ } else {
+ readDir.touchFile(name);
+ }
+ }
+
+ @Override
+ public IndexOutput createOutput(String name) throws IOException {
+ return writeDir.createOutput(name);
+ }
+
+ @Override
+ public IndexInput openInput(String name) throws IOException {
+ if (writeDir.fileExists(name)) {
+ return writeDir.openInput(name);
+ } else {
+ return readDir.openInput(name);
+ }
+ }
+
+ @Override
+ public IndexInput openInput(String name, int bufferSize) throws IOException {
+ if (writeDir.fileExists(name)) {
+ return writeDir.openInput(name, bufferSize);
+ } else {
+ return readDir.openInput(name, bufferSize);
+ }
+ }
+
+ @Override
+ public void close() throws IOException {
+ try {
+ if (readDir != null) {
+ readDir.close();
+ }
+ } finally {
+ if (writeDir != null) {
+ writeDir.close();
+ }
+ }
+ }
+
+ public String toString() {
+ return this.getClass().getName() + "@" + readDir + "&" + writeDir;
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/MixedDirectory.java
------------------------------------------------------------------------------
svn:executable = *
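
A sketch of how the two halves are wired together, exactly as the ShardWriter
constructor below does; fs, localFs, perm, temp and conf are assumed to come
from the surrounding job.

    // reads fall through to the HDFS copy; new files go to the local temp dir
    Directory dir = new MixedDirectory(fs, perm, localFs,
        fs.startLocalOutput(perm, temp), conf);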
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/RAMDirectoryUtil.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/RAMDirectoryUtil.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/RAMDirectoryUtil.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/RAMDirectoryUtil.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.lucene;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+
+import org.apache.hadoop.io.Text;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.RAMDirectory;
+
+/**
+ * A utility class which writes the files of an index in a RAMDirectory to a
+ * DataOutput and reads them back from a DataInput into a RAMDirectory.
+ */
+public class RAMDirectoryUtil {
+ private static final int BUFFER_SIZE = 1024; // RAMOutputStream.BUFFER_SIZE;
+
+ /**
+ * Write a number of files from a ram directory to a data output.
+ * @param out the data output
+ * @param dir the ram directory
+ * @param names the names of the files to write
+ * @throws IOException
+ */
+ public static void writeRAMFiles(DataOutput out, RAMDirectory dir,
+ String[] names) throws IOException {
+ out.writeInt(names.length);
+
+ for (int i = 0; i < names.length; i++) {
+ Text.writeString(out, names[i]);
+ long length = dir.fileLength(names[i]);
+ out.writeLong(length);
+
+ if (length > 0) {
+ // can we avoid the extra copy?
+ IndexInput input = null;
+ try {
+ input = dir.openInput(names[i], BUFFER_SIZE);
+
+ int position = 0;
+ byte[] buffer = new byte[BUFFER_SIZE];
+
+ while (position < length) {
+ int len =
+ position + BUFFER_SIZE <= length ? BUFFER_SIZE
+ : (int) (length - position);
+ input.readBytes(buffer, 0, len);
+ out.write(buffer, 0, len);
+ position += len;
+ }
+ } finally {
+ if (input != null) {
+ input.close();
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * Read a number of files from a data input to a ram directory.
+ * @param in the data input
+ * @param dir the ram directory
+ * @throws IOException
+ */
+ public static void readRAMFiles(DataInput in, RAMDirectory dir)
+ throws IOException {
+ int numFiles = in.readInt();
+
+ for (int i = 0; i < numFiles; i++) {
+ String name = Text.readString(in);
+ long length = in.readLong();
+
+ if (length > 0) {
+ // can we avoid the extra copy?
+ IndexOutput output = null;
+ try {
+ output = dir.createOutput(name);
+
+ int position = 0;
+ byte[] buffer = new byte[BUFFER_SIZE];
+
+ while (position < length) {
+ int len =
+ position + BUFFER_SIZE <= length ? BUFFER_SIZE
+ : (int) (length - position);
+ in.readFully(buffer, 0, len);
+ output.writeBytes(buffer, 0, len);
+ position += len;
+ }
+ } finally {
+ if (output != null) {
+ output.close();
+ }
+ }
+ }
+ }
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/RAMDirectoryUtil.java
------------------------------------------------------------------------------
svn:executable = *
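
A round-trip sketch, assuming the RAMDirectory was populated through Lucene so
that list() and fileLength() describe its contents accurately:

    RAMDirectory src = new RAMDirectory();
    // ... populate src, e.g. via an IndexWriter ...
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    RAMDirectoryUtil.writeRAMFiles(new DataOutputStream(bytes), src, src.list());

    RAMDirectory copy = new RAMDirectory();
    RAMDirectoryUtil.readRAMFiles(
        new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())), copy);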
Added: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/ShardWriter.java
URL: http://svn.apache.org/viewvc/hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/ShardWriter.java?rev=639138&view=auto
==============================================================================
--- hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/ShardWriter.java (added)
+++ hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/ShardWriter.java Wed Mar 19 20:33:18 2008
@@ -0,0 +1,233 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.contrib.index.lucene;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.contrib.index.mapred.IndexUpdateConfiguration;
+import org.apache.hadoop.contrib.index.mapred.IntermediateForm;
+import org.apache.hadoop.contrib.index.mapred.Shard;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.store.Directory;
+
+/**
+ * The initial version of an index is stored in the perm dir. Index files
+ * created by newer versions are written to a temp dir on the local FS. After
+ * successfully creating the new version in the temp dir, the shard writer
+ * moves the new files to the perm dir and deletes the temp dir in close().
+ */
+public class ShardWriter {
+ static final Log LOG = LogFactory.getLog(ShardWriter.class);
+
+ private final FileSystem fs;
+ private final FileSystem localFs;
+ private final Path perm;
+ private final Path temp;
+ private final Directory dir;
+ private final IndexWriter writer;
+ private int maxNumSegments;
+ private long numForms = 0;
+
+ /**
+ * Constructor
+ * @param fs the file system of the permanent (shard) directory
+ * @param shard the shard whose index this writer updates
+ * @param tempDir name of the temp directory on the local FS
+ * @param iconf the index update configuration
+ * @throws IOException
+ */
+ public ShardWriter(FileSystem fs, Shard shard, String tempDir,
+ IndexUpdateConfiguration iconf) throws IOException {
+ LOG.info("Construct a shard writer");
+
+ this.fs = fs;
+ localFs = FileSystem.getLocal(iconf.getConfiguration());
+ perm = new Path(shard.getDirectory());
+ temp = new Path(tempDir);
+
+ long initGeneration = shard.getGeneration();
+ if (!fs.exists(perm)) {
+ assert (initGeneration < 0);
+ fs.mkdirs(perm);
+ } else {
+ restoreGeneration(fs, perm, initGeneration);
+ }
+ dir =
+ new MixedDirectory(fs, perm, localFs, fs.startLocalOutput(perm, temp),
+ iconf.getConfiguration());
+
+ // analyzer is null because we only use addIndexes, not addDocument
+ writer =
+ new IndexWriter(dir, false, null,
+ initGeneration < 0 ? new KeepOnlyLastCommitDeletionPolicy()
+ : new MixedDeletionPolicy());
+ setParameters(iconf);
+ }
+
+ /**
+ * Process an intermediate form by carrying out, on the Lucene instance of
+ * the shard, the deletes and the inserts (a ram index) in the form.
+ * @param form the intermediate form containing deletes and a ram index
+ * @throws IOException
+ */
+ public void process(IntermediateForm form) throws IOException {
+ // first delete
+ Iterator<Term> iter = form.deleteTermIterator();
+ while (iter.hasNext()) {
+ writer.deleteDocuments(iter.next());
+ }
+ // then insert
+ writer.addIndexesNoOptimize(new Directory[] { form.getDirectory() });
+ numForms++;
+ }
+
+ /**
+ * Close the shard writer. Optimize the Lucene instance of the shard before
+ * closing if necessary, and copy the files created in the temp directory
+ * to the permanent directory after closing.
+ * @throws IOException
+ */
+ public void close() throws IOException {
+ LOG.info("Closing the shard writer, processed " + numForms + " forms");
+ try {
+ try {
+ if (maxNumSegments > 0) {
+ writer.optimize(maxNumSegments);
+ LOG.info("Optimized the shard into at most " + maxNumSegments
+ + " segments");
+ }
+ } finally {
+ writer.close();
+ LOG.info("Closed Lucene index writer");
+ }
+
+ moveFromTempToPerm();
+ LOG.info("Moved new index files to " + perm);
+
+ } finally {
+ dir.close();
+ LOG.info("Closed the shard writer");
+ }
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#toString()
+ */
+ public String toString() {
+ return this.getClass().getName() + "@" + perm + "&" + temp;
+ }
+
+ private void setParameters(IndexUpdateConfiguration iconf) {
+ int maxFieldLength = iconf.getIndexMaxFieldLength();
+ if (maxFieldLength > 0) {
+ writer.setMaxFieldLength(maxFieldLength);
+ }
+ writer.setUseCompoundFile(iconf.getIndexUseCompoundFile());
+ maxNumSegments = iconf.getIndexMaxNumSegments();
+
+ if (maxFieldLength > 0) {
+ LOG.info("sea.max.field.length = " + writer.getMaxFieldLength());
+ }
+ LOG.info("sea.use.compound.file = " + writer.getUseCompoundFile());
+ LOG.info("sea.max.num.segments = " + maxNumSegments);
+ }
+
+ // in case a previous reduce task fails, restore the generation to
+ // the original starting point by deleting the segments.gen file
+ // and the segments_N files whose generations are greater than the
+ // starting generation; the rest of the unwanted files will be deleted
+ // once the unwanted segments_N files are deleted
+ private void restoreGeneration(FileSystem fs, Path perm, long startGen)
+ throws IOException {
+
+ FileStatus[] fileStatus = fs.listStatus(perm, new PathFilter() {
+ public boolean accept(Path path) {
+ return LuceneUtil.isSegmentsFile(path.getName());
+ }
+ });
+
+ // remove the segments_N files whose generations are greater than
+ // the starting generation
+ for (int i = 0; i < fileStatus.length; i++) {
+ Path path = fileStatus[i].getPath();
+ if (startGen < LuceneUtil.generationFromSegmentsFileName(path.getName())) {
+ fs.delete(path);
+ }
+ }
+
+ // always remove segments.gen in case the last failed try removed segments_N
+ // but not segments.gen; segments.gen will be overwritten anyway
+ Path segmentsGenFile = new Path(perm, LuceneUtil.IndexFileNames.SEGMENTS_GEN);
+ if (fs.exists(segmentsGenFile)) {
+ fs.delete(segmentsGenFile);
+ }
+ }
+
+ // move the files created in the temp dir into the perm dir
+ // and then delete the temp dir from the local FS
+ private void moveFromTempToPerm() throws IOException {
+ try {
+ FileStatus[] fileStatus =
+ localFs.listStatus(temp, LuceneIndexFileNameFilter.getFilter());
+ Path segmentsPath = null;
+ Path segmentsGenPath = null;
+
+ // move the files created in temp dir except segments_N and segments.gen
+ for (int i = 0; i < fileStatus.length; i++) {
+ Path path = fileStatus[i].getPath();
+ String name = path.getName();
+
+ if (LuceneUtil.isSegmentsGenFile(name)) {
+ assert (segmentsGenPath == null);
+ segmentsGenPath = path;
+ } else if (LuceneUtil.isSegmentsFile(name)) {
+ assert (segmentsPath == null);
+ segmentsPath = path;
+ } else {
+ fs.completeLocalOutput(new Path(perm, name), path);
+ }
+ }
+
+ // move the segments_N file
+ if (segmentsPath != null) {
+ fs.completeLocalOutput(new Path(perm, segmentsPath.getName()),
+ segmentsPath);
+ }
+
+ // move the segments.gen file
+ if (segmentsGenPath != null) {
+ fs.completeLocalOutput(new Path(perm, segmentsGenPath.getName()),
+ segmentsGenPath);
+ }
+ } finally {
+ // finally delete the temp dir (files should have been deleted)
+ localFs.delete(temp);
+ }
+ }
+
+}
Propchange: hadoop/core/trunk/src/contrib/index/src/java/org/apache/hadoop/contrib/index/lucene/ShardWriter.java
------------------------------------------------------------------------------
svn:executable = *
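
A sketch of the intended lifecycle; fs, shard, tempDirName, iconf and forms
are assumed to come from the reduce task that owns this shard.

    ShardWriter shardWriter = new ShardWriter(fs, shard, tempDirName, iconf);
    for (Iterator<IntermediateForm> it = forms.iterator(); it.hasNext();) {
      shardWriter.process(it.next()); // apply deletes, then add the ram index
    }
    shardWriter.close(); // optimize if configured, then publish temp files to perm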