Posted to commits@accumulo.apache.org by ec...@apache.org on 2012/01/12 17:06:20 UTC

svn commit: r1230608 [6/16] - in /incubator/accumulo/trunk: ./ contrib/accumulo_sample/ src/assemble/ src/core/ src/core/src/main/java/org/apache/accumulo/core/client/impl/thrift/ src/core/src/main/java/org/apache/accumulo/core/master/thrift/ src/core/...

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/LfLineReader.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/LfLineReader.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/LfLineReader.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/LfLineReader.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.reader;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
+/**
+ * A line reader for an input stream that treats only '\n' (LF) as a line terminator; '\r' is never treated as part of a newline.
+ */
+public class LfLineReader {
+  private static final int DEFAULT_BUFFER_SIZE = 64 * 1024;
+  private int bufferSize = DEFAULT_BUFFER_SIZE;
+  private InputStream in;
+  private byte[] buffer;
+  // the number of bytes of real data in the buffer
+  private int bufferLength = 0;
+  // the current position in the buffer
+  private int bufferPosn = 0;
+  
+  private static final byte LF = '\n';
+  
+  /**
+   * Create a line reader that reads from the given stream using the default buffer size (64k).
+   * 
+   * @param in
+   *          The input stream
+   */
+  public LfLineReader(InputStream in) {
+    this(in, DEFAULT_BUFFER_SIZE);
+  }
+  
+  /**
+   * Create a line reader that reads from the given stream using the given buffer size.
+   * 
+   * @param in
+   *          The input stream
+   * @param bufferSize
+   *          Size of the read buffer
+   */
+  public LfLineReader(InputStream in, int bufferSize) {
+    this.in = in;
+    this.bufferSize = bufferSize;
+    this.buffer = new byte[this.bufferSize];
+  }
+  
+  /**
+   * Create a line reader that reads from the given stream using the <code>io.file.buffer.size</code> specified in the given <code>Configuration</code>.
+   * 
+   * @param in
+   *          input stream
+   * @param conf
+   *          configuration
+   * @throws IOException
+   */
+  public LfLineReader(InputStream in, Configuration conf) throws IOException {
+    this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE));
+  }
+  
+  /**
+   * Close the underlying stream.
+   * 
+   * @throws IOException
+   */
+  public void close() throws IOException {
+    in.close();
+  }
+  
+  /**
+   * Read one line from the InputStream into the given Text. A line can be terminated by '\n' (LF). EOF also terminates an otherwise unterminated line.
+   * 
+   * @param str
+   *          the object to store the given line (without newline)
+   * @param maxLineLength
+   *          the maximum number of bytes to store into str; the rest of the line is silently discarded.
+   * @param maxBytesToConsume
+   *          the maximum number of bytes to consume in this call. This is only a hint, because if the line crosses this threshold, we allow the read to
+   *          finish it; the result can overshoot by as much as one buffer length.
+   * 
+   * @return the number of bytes read including the (longest) newline found.
+   * 
+   * @throws IOException
+   *           if the underlying stream throws
+   */
+  public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
+    /*
+     * We're reading data from in, but the head of the stream may already be buffered in buffer, so we have two cases:
+     * 
+     * 1. No newline character is in the buffer, so we copy everything we have and read another buffer from the stream.
+     * 
+     * 2. An unambiguously terminated line is in the buffer, so we just copy it to str.
+     */
+    str.clear();
+    int txtLength = 0; // tracks str.getLength(), as an optimization
+    int newlineLength = 0; // length of terminating newline
+    long bytesConsumed = 0;
+    do {
+      int startPosn = bufferPosn; // starting from where we left off the last time
+      if (bufferPosn >= bufferLength) {
+        startPosn = bufferPosn = 0;
+        bufferLength = in.read(buffer);
+        if (bufferLength <= 0)
+          break; // EOF
+      }
+      for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline
+        if (buffer[bufferPosn] == LF) {
+          newlineLength = 1;
+          ++bufferPosn; // at next invocation proceed from following byte
+          break;
+        }
+      }
+      int readLength = bufferPosn - startPosn;
+      bytesConsumed += readLength;
+      int appendLength = readLength - newlineLength;
+      if (appendLength > maxLineLength - txtLength) {
+        appendLength = maxLineLength - txtLength;
+      }
+      if (appendLength > 0) {
+        str.append(buffer, startPosn, appendLength);
+        txtLength += appendLength;
+      }
+    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);
+    
+    if (bytesConsumed > Integer.MAX_VALUE)
+      throw new IOException("Too many bytes before newline: " + bytesConsumed);
+    return (int) bytesConsumed;
+  }
+  
+  /**
+   * Read from the InputStream into the given Text.
+   * 
+   * @param str
+   *          the object to store the given line
+   * @param maxLineLength
+   *          the maximum number of bytes to store into str.
+   * @return the number of bytes read including the newline
+   * @throws IOException
+   *           if the underlying stream throws
+   */
+  public int readLine(Text str, int maxLineLength) throws IOException {
+    return readLine(str, maxLineLength, Integer.MAX_VALUE);
+  }
+  
+  /**
+   * Read from the InputStream into the given Text.
+   * 
+   * @param str
+   *          the object to store the given line
+   * @return the number of bytes read including the newline
+   * @throws IOException
+   *           if the underlying stream throws
+   */
+  public int readLine(Text str) throws IOException {
+    return readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE);
+  }
+  
+}
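
As a quick illustration of the readLine() contract above, here is a minimal sketch (not part of this commit) of streaming a file line by line with LfLineReader; the file name is an invented placeholder:

    import java.io.FileInputStream;
    import java.io.IOException;

    import org.apache.accumulo.examples.wikisearch.reader.LfLineReader;
    import org.apache.hadoop.io.Text;

    public class LfLineReaderDemo {
      public static void main(String[] args) throws IOException {
        // "input.txt" is a placeholder path for this sketch.
        LfLineReader in = new LfLineReader(new FileInputStream("input.txt"));
        try {
          Text line = new Text();
          // readLine() returns the number of bytes consumed, including the '\n'; 0 means EOF.
          while (in.readLine(line) > 0) {
            System.out.println(line);
          }
        } finally {
          in.close();
        }
      }
    }

Because only LF terminates a line, a file with CRLF line endings would leave a trailing '\r' on each returned Text value.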

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/LfLineReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/LongLineRecordReader.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/LongLineRecordReader.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/LongLineRecordReader.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/LongLineRecordReader.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.reader;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
+import org.apache.hadoop.util.LineReader;
+
+/**
+ * A copy of {@link LineRecordReader} which does not discard lines longer than "mapred.linerecordreader.maxlength". Instead, it returns them, leaving it to the
+ * mapper to decide what to do with them. It also does not treat '\r' (CR) characters as newlines -- it uses {@link LfLineReader} instead of {@link LineReader}
+ * to read lines.
+ */
+public class LongLineRecordReader extends RecordReader<LongWritable,Text> {
+  private CompressionCodecFactory compressionCodecs = null;
+  private long start;
+  private long pos;
+  private long end;
+  private LfLineReader in;
+  private int maxLineLength;
+  private LongWritable key = null;
+  private Text value = null;
+  
+  @Override
+  public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
+    FileSplit split = (FileSplit) genericSplit;
+    Configuration job = context.getConfiguration();
+    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
+    start = split.getStart();
+    end = start + split.getLength();
+    final Path file = split.getPath();
+    compressionCodecs = new CompressionCodecFactory(job);
+    final CompressionCodec codec = compressionCodecs.getCodec(file);
+    
+    // open the file and seek to the start of the split
+    FileSystem fs = file.getFileSystem(job);
+    FSDataInputStream fileIn = fs.open(split.getPath());
+    boolean skipFirstLine = false;
+    if (codec != null) {
+      in = new LfLineReader(codec.createInputStream(fileIn), job);
+      end = Long.MAX_VALUE;
+    } else {
+      if (start != 0) {
+        skipFirstLine = true;
+        --start;
+        fileIn.seek(start);
+      }
+      in = new LfLineReader(fileIn, job);
+    }
+    if (skipFirstLine) { // skip first line and re-establish "start".
+      start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
+    }
+    this.pos = start;
+  }
+  
+  @Override
+  public boolean nextKeyValue() throws IOException {
+    if (key == null) {
+      key = new LongWritable();
+    }
+    key.set(pos);
+    if (value == null) {
+      value = new Text();
+    }
+    int newSize = 0;
+    if (pos < end) {
+      newSize = in.readLine(value, maxLineLength, Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
+      if (newSize != 0) {
+        pos += newSize;
+      }
+    }
+    if (newSize == 0) {
+      key = null;
+      value = null;
+      return false;
+    } else {
+      return true;
+    }
+  }
+  
+  @Override
+  public LongWritable getCurrentKey() {
+    return key;
+  }
+  
+  @Override
+  public Text getCurrentValue() {
+    return value;
+  }
+  
+  /**
+   * Get the progress within the split
+   */
+  @Override
+  public float getProgress() {
+    if (start == end) {
+      return 0.0f;
+    } else {
+      return Math.min(1.0f, (pos - start) / (float) (end - start));
+    }
+  }
+  
+  @Override
+  public synchronized void close() throws IOException {
+    if (in != null) {
+      in.close();
+    }
+  }
+}
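
LongLineRecordReader is only the reader half of the input path; a job still needs an InputFormat that returns it. A hypothetical wrapper (an assumption for illustration, not part of this commit) could look like:

    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.InputSplit;
    import org.apache.hadoop.mapreduce.RecordReader;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

    import org.apache.accumulo.examples.wikisearch.reader.LongLineRecordReader;

    // Hypothetical InputFormat that hands each file split to a LongLineRecordReader.
    public class LongLineInputFormat extends FileInputFormat<LongWritable,Text> {
      @Override
      public RecordReader<LongWritable,Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        return new LongLineRecordReader();
      }
    }

A job would then select it with job.setInputFormatClass(LongLineInputFormat.class) and, if desired, cap line lengths via mapred.linerecordreader.maxlength in its Configuration.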

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/LongLineRecordReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/util/TextUtil.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/util/TextUtil.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/util/TextUtil.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/util/TextUtil.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.util;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import org.apache.hadoop.io.Text;
+import org.apache.accumulo.core.iterators.aggregation.LongSummation;
+
+public class TextUtil {
+  
+  /**
+   * Appends a null byte followed by the UTF-8 bytes of the given string to the given {@link Text}
+   * 
+   * @param text
+   *          the Text to which to append
+   * @param string
+   *          the String to append
+   */
+  public static void textAppend(Text text, String string) {
+    appendNullByte(text);
+    textAppendNoNull(text, string);
+  }
+  
+  public static void textAppend(Text text, String string, boolean replaceBadChar) {
+    appendNullByte(text);
+    textAppendNoNull(text, string, replaceBadChar);
+  }
+  
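+  /**
+   * Appends a null byte followed by the 8-byte encoding of the given long (via {@link LongSummation#longToBytes(long)}) to the given {@link Text}
+   * 
+   * @param t
+   *          the Text to which to append
+   * @param s
+   *          the long value to append
+   */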
+  public static void textAppend(Text t, long s) {
+    t.append(nullByte, 0, 1);
+    t.append(LongSummation.longToBytes(s), 0, 8);
+  }
+  
+  private static final byte[] nullByte = {0};
+  
+  /**
+   * Appends a null byte to the given text
+   * 
+   * @param text
+   *          the text to which to append the null byte
+   */
+  public static void appendNullByte(Text text) {
+    text.append(nullByte, 0, nullByte.length);
+  }
+  
+  /**
+   * Appends the UTF-8 bytes of the given string to the given {@link Text}
+   * 
+   * @param t
+   *          the Text to which to append
+   * @param s
+   *          the String to append
+   */
+  public static void textAppendNoNull(Text t, String s) {
+    textAppendNoNull(t, s, false);
+  }
+  
+  /**
+   * Appends the UTF-8 bytes of the given string to the given {@link Text}, optionally replacing malformed input
+   * 
+   * @param t
+   *          the Text to which to append
+   * @param s
+   *          the String to append
+   * @param replaceBadChar
+   *          whether to replace malformed input with a substitution character instead of throwing an IllegalArgumentException
+   */
+  public static void textAppendNoNull(Text t, String s, boolean replaceBadChar) {
+    try {
+      ByteBuffer buffer = Text.encode(s, replaceBadChar);
+      t.append(buffer.array(), 0, buffer.limit());
+    } catch (CharacterCodingException cce) {
+      throw new IllegalArgumentException(cce);
+    }
+  }
+  
+  /**
+   * Converts the given string to its UTF-8 bytes. This uses Hadoop's method for converting strings to UTF-8 and is much faster than calling
+   * {@link String#getBytes(String)}.
+   * 
+   * @param string
+   *          the string to convert
+   * @return the UTF-8 representation of the string
+   */
+  public static byte[] toUtf8(String string) {
+    ByteBuffer buffer;
+    try {
+      buffer = Text.encode(string, false);
+    } catch (CharacterCodingException cce) {
+      throw new IllegalArgumentException(cce);
+    }
+    byte[] bytes = new byte[buffer.limit()];
+    System.arraycopy(buffer.array(), 0, bytes, 0, bytes.length);
+    return bytes;
+  }
+}
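
The null-byte separator is what lets several fields be packed into a single Text and still sort component by component. A small usage sketch (the field name and value are invented for the example):

    import org.apache.accumulo.examples.wikisearch.util.TextUtil;
    import org.apache.hadoop.io.Text;

    public class TextUtilDemo {
      public static void main(String[] args) {
        // Compose "TITLE" + '\0' + "Accumulo" into one Text value.
        Text colq = new Text("TITLE");
        TextUtil.textAppend(colq, "Accumulo");
        // 5 bytes + 1 null byte + 8 bytes = 14 bytes total.
        System.out.println(colq.getLength());
      }
    }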

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/util/TextUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/protobuf/TermWeight.proto
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/protobuf/TermWeight.proto?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/protobuf/TermWeight.proto (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/protobuf/TermWeight.proto Thu Jan 12 16:06:14 2012
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// compile with protoc --java_out ../java
+// compile extra builder util with java accumulo.data.protobuf.builder.ProtoBufBuilder -d ../java accumulo.data.protobuf.UidList
+//      classpath for compile command should include ../../../target/classes and protobuf-java-2.2.0.jar
+
+package protobuf;
+
+option java_package = "protobuf";
+option optimize_for = SPEED;
+
+message Info {
+  required float normalizedTermFrequency = 1;
+  repeated uint32 wordOffset = 2;
+}
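
Once protoc has run, the generated builder API is used roughly as below. Note that this .proto declares java_package = "protobuf", while the tests in this commit import the class from org.apache.accumulo.examples.wikisearch.protobuf; the sketch follows the tests. Offsets and frequency are made-up values:

    import org.apache.accumulo.examples.wikisearch.protobuf.TermWeight;

    public class TermWeightDemo {
      public static void main(String[] args) throws Exception {
        // Record a term occurring at word offsets 4 and 17 in a document.
        TermWeight.Info info = TermWeight.Info.newBuilder()
            .setNormalizedTermFrequency(0.08f)
            .addWordOffset(4)
            .addWordOffset(17)
            .build();
        byte[] wire = info.toByteArray(); // the bytes stored in an Accumulo Value
        TermWeight.Info roundTrip = TermWeight.Info.parseFrom(wire);
        System.out.println(roundTrip.getWordOffsetCount()); // prints 2
      }
    }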

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/protobuf/Uid.proto
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/protobuf/Uid.proto?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/protobuf/Uid.proto (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/protobuf/Uid.proto Thu Jan 12 16:06:14 2012
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// compile with protoc --java_out ../java
+// compile extra builder util with java accumulo.data.protobuf.builder.ProtoBufBuilder -d ../java accumulo.data.protobuf.UidList
+//      classpath for compile command should include ../../../target/classes and protobuf-java-2.2.0.jar
+
+package protobuf;
+
+option java_package = "protobuf";
+option optimize_for = SPEED;
+
+message List {
+  required bool IGNORE = 1;
+  required uint64 COUNT = 2;
+  repeated string UID = 3;
+}
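
The IGNORE flag is how the global index copes with very common terms: once a term has too many UIDs, the list is dropped and only COUNT is kept, as exercised by GlobalIndexUidAggregatorTest below. A minimal construction sketch, with invented UIDs and the same generated-package assumption as above:

    import org.apache.accumulo.examples.wikisearch.protobuf.Uid;

    public class UidListDemo {
      public static void main(String[] args) {
        // An exact (non-ignored) list of two document UIDs.
        Uid.List uids = Uid.List.newBuilder()
            .setIGNORE(false)
            .setCOUNT(2)
            .addUID("doc-00001")
            .addUID("doc-00042")
            .build();
        System.out.println(uids.getUIDCount()); // prints 2
      }
    }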

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/protobuf/compile_protos.sh
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/protobuf/compile_protos.sh?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/protobuf/compile_protos.sh (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/protobuf/compile_protos.sh Thu Jan 12 16:06:14 2012
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+for PROTO in *.proto; do protoc --java_out ../java "$PROTO"; done

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/protobuf/compile_protos.sh
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/protobuf/compile_protos.sh
------------------------------------------------------------------------------
    svn:executable = *

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/aggregator/GlobalIndexUidAggregatorTest.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/aggregator/GlobalIndexUidAggregatorTest.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/aggregator/GlobalIndexUidAggregatorTest.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/aggregator/GlobalIndexUidAggregatorTest.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,178 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.aggregator;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.UUID;
+
+import junit.framework.TestCase;
+
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.iterators.aggregation.Aggregator;
+import org.apache.accumulo.examples.wikisearch.aggregator.GlobalIndexUidAggregator;
+import org.apache.accumulo.examples.wikisearch.protobuf.Uid;
+import org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.Builder;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+
+
+@SuppressWarnings("deprecation")
+public class GlobalIndexUidAggregatorTest extends TestCase {
+  
+  Aggregator agg = new GlobalIndexUidAggregator();
+  
+  private Uid.List.Builder createNewUidList() {
+    return Uid.List.newBuilder();
+  }
+  
+  public void testSingleUid() {
+    agg.reset();
+    Builder b = createNewUidList();
+    b.setCOUNT(1);
+    b.setIGNORE(false);
+    b.addUID(UUID.randomUUID().toString());
+    Uid.List uidList = b.build();
+    Value val = new Value(uidList.toByteArray());
+    agg.collect(val);
+    Value result = agg.aggregate();
+    assertTrue(val.compareTo(result.get()) == 0);
+  }
+  
+  public void testLessThanMax() throws Exception {
+    agg.reset();
+    List<String> savedUUIDs = new ArrayList<String>();
+    for (int i = 0; i < GlobalIndexUidAggregator.MAX - 1; i++) {
+      Builder b = createNewUidList();
+      b.setIGNORE(false);
+      String uuid = UUID.randomUUID().toString();
+      savedUUIDs.add(uuid);
+      b.setCOUNT(i);
+      b.addUID(uuid);
+      Uid.List uidList = b.build();
+      Value val = new Value(uidList.toByteArray());
+      agg.collect(val);
+    }
+    Value result = agg.aggregate();
+    Uid.List resultList = Uid.List.parseFrom(result.get());
+    assertFalse(resultList.getIGNORE());
+    assertTrue(resultList.getUIDCount() == (GlobalIndexUidAggregator.MAX - 1));
+    List<String> resultListUUIDs = resultList.getUIDList();
+    for (String s : savedUUIDs)
+      assertTrue(resultListUUIDs.contains(s));
+  }
+  
+  public void testEqualsMax() throws Exception {
+    agg.reset();
+    List<String> savedUUIDs = new ArrayList<String>();
+    for (int i = 0; i < GlobalIndexUidAggregator.MAX; i++) {
+      Builder b = createNewUidList();
+      b.setIGNORE(false);
+      String uuid = UUID.randomUUID().toString();
+      savedUUIDs.add(uuid);
+      b.setCOUNT(i);
+      b.addUID(uuid);
+      Uid.List uidList = b.build();
+      Value val = new Value(uidList.toByteArray());
+      agg.collect(val);
+    }
+    Value result = agg.aggregate();
+    Uid.List resultList = Uid.List.parseFrom(result.get());
+    assertFalse(resultList.getIGNORE());
+    assertTrue(resultList.getUIDCount() == (GlobalIndexUidAggregator.MAX));
+    List<String> resultListUUIDs = resultList.getUIDList();
+    for (String s : savedUUIDs)
+      assertTrue(resultListUUIDs.contains(s));
+  }
+  
+  public void testMoreThanMax() throws Exception {
+    agg.reset();
+    List<String> savedUUIDs = new ArrayList<String>();
+    for (int i = 0; i < GlobalIndexUidAggregator.MAX + 10; i++) {
+      Builder b = createNewUidList();
+      b.setIGNORE(false);
+      String uuid = UUID.randomUUID().toString();
+      savedUUIDs.add(uuid);
+      b.setCOUNT(1);
+      b.addUID(uuid);
+      Uid.List uidList = b.build();
+      Value val = new Value(uidList.toByteArray());
+      agg.collect(val);
+    }
+    Value result = agg.aggregate();
+    Uid.List resultList = Uid.List.parseFrom(result.get());
+    assertTrue(resultList.getIGNORE());
+    assertTrue(resultList.getUIDCount() == 0);
+    assertTrue(resultList.getCOUNT() == (GlobalIndexUidAggregator.MAX + 10));
+  }
+  
+  public void testSeenIgnore() throws Exception {
+    agg.reset();
+    Builder b = createNewUidList();
+    b.setIGNORE(true);
+    b.setCOUNT(0);
+    Uid.List uidList = b.build();
+    Value val = new Value(uidList.toByteArray());
+    agg.collect(val);
+    b = createNewUidList();
+    b.setIGNORE(false);
+    b.setCOUNT(1);
+    b.addUID(UUID.randomUUID().toString());
+    uidList = b.build();
+    val = new Value(uidList.toByteArray());
+    agg.collect(val);
+    Value result = agg.aggregate();
+    Uid.List resultList = Uid.List.parseFrom(result.get());
+    assertTrue(resultList.getIGNORE());
+    assertTrue(resultList.getUIDCount() == 0);
+    assertTrue(resultList.getCOUNT() == 1);
+  }
+  
+  public void testInvalidValueType() throws Exception {
+    Logger.getLogger(GlobalIndexUidAggregator.class).setLevel(Level.OFF);
+    agg.reset();
+    Value val = new Value(UUID.randomUUID().toString().getBytes());
+    agg.collect(val);
+    Value result = agg.aggregate();
+    Uid.List resultList = Uid.List.parseFrom(result.get());
+    assertFalse(resultList.getIGNORE());
+    assertTrue(resultList.getUIDCount() == 0);
+    assertTrue(resultList.getCOUNT() == 0);
+  }
+  
+  public void testCount() throws Exception {
+    agg.reset();
+    UUID uuid = UUID.randomUUID();
+    // Collect the same UUID five times.
+    for (int i = 0; i < 5; i++) {
+      Builder b = createNewUidList();
+      b.setCOUNT(1);
+      b.setIGNORE(false);
+      b.addUID(uuid.toString());
+      Uid.List uidList = b.build();
+      Value val = new Value(uidList.toByteArray());
+      agg.collect(val);
+    }
+    Value result = agg.aggregate();
+    Uid.List resultList = Uid.List.parseFrom(result.get());
+    assertFalse(resultList.getIGNORE());
+    assertTrue(resultList.getUIDCount() == 1);
+    assertTrue(resultList.getCOUNT() == 5);
+    
+  }
+  
+}

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/aggregator/GlobalIndexUidAggregatorTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/aggregator/TextIndexAggregatorTest.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/aggregator/TextIndexAggregatorTest.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/aggregator/TextIndexAggregatorTest.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/aggregator/TextIndexAggregatorTest.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.aggregator;
+
+import java.util.List;
+
+import junit.framework.Assert;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.examples.wikisearch.aggregator.TextIndexAggregator;
+import org.apache.accumulo.examples.wikisearch.protobuf.TermWeight;
+import org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.Builder;
+
+import com.google.protobuf.InvalidProtocolBufferException;
+
+public class TextIndexAggregatorTest {
+  private TextIndexAggregator aggregator;
+  
+  @Before
+  public void setup() throws Exception {
+    aggregator = new TextIndexAggregator();
+  }
+  
+  @After
+  public void cleanup() {
+    
+  }
+  
+  private TermWeight.Info.Builder createBuilder() {
+    return TermWeight.Info.newBuilder();
+  }
+  
+  @Test
+  public void testSingleValue() throws InvalidProtocolBufferException {
+    aggregator = new TextIndexAggregator();
+    Builder builder = createBuilder();
+    builder.addWordOffset(1);
+    builder.addWordOffset(5);
+    builder.setNormalizedTermFrequency(0.1f);
+    
+    aggregator.collect(new Value(builder.build().toByteArray()));
+    
+    Value result = aggregator.aggregate();
+    
+    TermWeight.Info info = TermWeight.Info.parseFrom(result.get());
+    
+    Assert.assertTrue(info.getNormalizedTermFrequency() == 0.1f);
+    
+    List<Integer> offsets = info.getWordOffsetList();
+    Assert.assertTrue(offsets.size() == 2);
+    Assert.assertTrue(offsets.get(0) == 1);
+    Assert.assertTrue(offsets.get(1) == 5);
+  }
+  
+  @Test
+  public void testAggregateTwoValues() throws InvalidProtocolBufferException {
+    aggregator = new TextIndexAggregator();
+    Builder builder = createBuilder();
+    builder.addWordOffset(1);
+    builder.addWordOffset(5);
+    builder.setNormalizedTermFrequency(0.1f);
+    
+    aggregator.collect(new Value(builder.build().toByteArray()));
+    
+    builder = createBuilder();
+    builder.addWordOffset(3);
+    builder.setNormalizedTermFrequency(0.05f);
+    
+    aggregator.collect(new Value(builder.build().toByteArray()));
+    
+    Value result = aggregator.aggregate();
+    
+    TermWeight.Info info = TermWeight.Info.parseFrom(result.get());
+    
+    Assert.assertTrue(info.getNormalizedTermFrequency() == 0.15f);
+    
+    List<Integer> offsets = info.getWordOffsetList();
+    Assert.assertTrue(offsets.size() == 3);
+    Assert.assertTrue(offsets.get(0) == 1);
+    Assert.assertTrue(offsets.get(1) == 3);
+    Assert.assertTrue(offsets.get(2) == 5);
+  }
+  
+  @Test
+  public void testAggregateManyValues() throws InvalidProtocolBufferException {
+    aggregator = new TextIndexAggregator();
+    
+    Builder builder = createBuilder();
+    builder.addWordOffset(13);
+    builder.addWordOffset(15);
+    builder.addWordOffset(19);
+    builder.setNormalizedTermFrequency(0.12f);
+    
+    aggregator.collect(new Value(builder.build().toByteArray()));
+    
+    builder = createBuilder();
+    builder.addWordOffset(1);
+    builder.addWordOffset(5);
+    builder.setNormalizedTermFrequency(0.1f);
+    
+    aggregator.collect(new Value(builder.build().toByteArray()));
+    
+    builder = createBuilder();
+    builder.addWordOffset(3);
+    builder.setNormalizedTermFrequency(0.05f);
+    
+    aggregator.collect(new Value(builder.build().toByteArray()));
+    
+    Value result = aggregator.aggregate();
+    
+    TermWeight.Info info = TermWeight.Info.parseFrom(result.get());
+    
+    Assert.assertTrue(info.getNormalizedTermFrequency() == 0.27f);
+    
+    List<Integer> offsets = info.getWordOffsetList();
+    Assert.assertTrue(offsets.size() == 6);
+    Assert.assertTrue(offsets.get(0) == 1);
+    Assert.assertTrue(offsets.get(1) == 3);
+    Assert.assertTrue(offsets.get(2) == 5);
+    Assert.assertTrue(offsets.get(3) == 13);
+    Assert.assertTrue(offsets.get(4) == 15);
+    Assert.assertTrue(offsets.get(5) == 19);
+  }
+}

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/aggregator/TextIndexAggregatorTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/ingest/StandaloneStatusReporter.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/ingest/StandaloneStatusReporter.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/ingest/StandaloneStatusReporter.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/ingest/StandaloneStatusReporter.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.ingest;
+
+import org.apache.hadoop.mapreduce.Counter;
+import org.apache.hadoop.mapreduce.Counters;
+import org.apache.hadoop.mapreduce.StatusReporter;
+
+public class StandaloneStatusReporter extends StatusReporter {
+  
+  private Counters c = new Counters();
+  
+  private long filesProcessed = 0;
+  private long recordsProcessed = 0;
+  
+  public Counters getCounters() {
+    return c;
+  }
+  
+  @Override
+  public Counter getCounter(Enum<?> name) {
+    return c.findCounter(name);
+  }
+  
+  @Override
+  public Counter getCounter(String group, String name) {
+    return c.findCounter(group, name);
+  }
+  
+  @Override
+  public void progress() {
+    // do nothing
+  }
+  
+  @Override
+  public void setStatus(String status) {
+    // do nothing
+  }
+  
+  public long getFilesProcessed() {
+    return filesProcessed;
+  }
+  
+  public long getRecordsProcessed() {
+    return recordsProcessed;
+  }
+  
+  public void incrementFilesProcessed() {
+    filesProcessed++;
+    recordsProcessed = 0;
+  }
+  
+  public void incrementRecordsProcessed() {
+    recordsProcessed++;
+  }
+}

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/ingest/StandaloneStatusReporter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapperTest.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapperTest.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapperTest.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapperTest.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.ingest;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URL;
+import java.util.HashMap;
+import java.util.Map.Entry;
+
+import junit.framework.Assert;
+
+import org.apache.accumulo.core.client.BatchWriter;
+import org.apache.accumulo.core.client.Connector;
+import org.apache.accumulo.core.client.MutationsRejectedException;
+import org.apache.accumulo.core.client.Scanner;
+import org.apache.accumulo.core.client.mock.MockInstance;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Mutation;
+import org.apache.accumulo.core.data.Range;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.security.Authorizations;
+import org.apache.accumulo.examples.wikisearch.ingest.WikipediaConfiguration;
+import org.apache.accumulo.examples.wikisearch.ingest.WikipediaMapper;
+import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RawLocalFileSystem;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.OutputCommitter;
+import org.apache.hadoop.mapreduce.RecordWriter;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Load some data into mock accumulo
+ */
+public class WikipediaMapperTest {
+  
+  private static final String METADATA_TABLE_NAME = "wikiMetadata";
+  
+  private static final String TABLE_NAME = "wiki";
+  
+  private static final String INDEX_TABLE_NAME = "wikiIndex";
+  
+  private static final String RINDEX_TABLE_NAME = "wikiReverseIndex";
+  
+  private class MockAccumuloRecordWriter extends RecordWriter<Text,Mutation> {
+    @Override
+    public void write(Text key, Mutation value) throws IOException, InterruptedException {
+      try {
+        writerMap.get(key).addMutation(value);
+      } catch (MutationsRejectedException e) {
+        throw new IOException("Error adding mutation", e);
+      }
+    }
+    
+    @Override
+    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
+      try {
+        for (BatchWriter w : writerMap.values()) {
+          w.flush();
+          w.close();
+        }
+      } catch (MutationsRejectedException e) {
+        throw new IOException("Error closing Batch Writer", e);
+      }
+    }
+    
+  }
+  
+  private Connector c = null;
+  private Configuration conf = new Configuration();
+  private HashMap<Text,BatchWriter> writerMap = new HashMap<Text,BatchWriter>();
+  
+  @Before
+  public void setup() throws Exception {
+    
+    conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
+    conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
+    conf.set(WikipediaConfiguration.TABLE_NAME, TABLE_NAME);
+    conf.set(WikipediaConfiguration.NUM_PARTITIONS, "1");
+    
+    MockInstance i = new MockInstance();
+    c = i.getConnector("root", "pass");
+    c.tableOperations().delete(METADATA_TABLE_NAME);
+    c.tableOperations().delete(TABLE_NAME);
+    c.tableOperations().delete(INDEX_TABLE_NAME);
+    c.tableOperations().delete(RINDEX_TABLE_NAME);
+    c.tableOperations().create(METADATA_TABLE_NAME);
+    c.tableOperations().create(TABLE_NAME);
+    c.tableOperations().create(INDEX_TABLE_NAME);
+    c.tableOperations().create(RINDEX_TABLE_NAME);
+    
+    writerMap.put(new Text(METADATA_TABLE_NAME), c.createBatchWriter(METADATA_TABLE_NAME, 1000L, 1000L, 1));
+    writerMap.put(new Text(TABLE_NAME), c.createBatchWriter(TABLE_NAME, 1000L, 1000L, 1));
+    writerMap.put(new Text(INDEX_TABLE_NAME), c.createBatchWriter(INDEX_TABLE_NAME, 1000L, 1000L, 1));
+    writerMap.put(new Text(RINDEX_TABLE_NAME), c.createBatchWriter(RINDEX_TABLE_NAME, 1000L, 1000L, 1));
+    
+    TaskAttemptID id = new TaskAttemptID();
+    TaskAttemptContext context = new TaskAttemptContext(conf, id);
+    
+    RawLocalFileSystem fs = new RawLocalFileSystem();
+    fs.setConf(conf);
+    
+    URL url = ClassLoader.getSystemResource("enwiki-20110901-001.xml");
+    Assert.assertNotNull(url);
+    File data = new File(url.toURI());
+    Path tmpFile = new Path(data.getAbsolutePath());
+    
+    // Setup the Mapper
+    InputSplit split = new FileSplit(tmpFile, 0, fs.pathToFile(tmpFile).length(), null);
+    AggregatingRecordReader rr = new AggregatingRecordReader();
+    Path ocPath = new Path(tmpFile, "oc");
+    OutputCommitter oc = new FileOutputCommitter(ocPath, context);
+    fs.deleteOnExit(ocPath);
+    StandaloneStatusReporter sr = new StandaloneStatusReporter();
+    rr.initialize(split, context);
+    MockAccumuloRecordWriter rw = new MockAccumuloRecordWriter();
+    WikipediaMapper mapper = new WikipediaMapper();
+    
+    // Load data into Mock Accumulo
+    Mapper<LongWritable,Text,Text,Mutation>.Context con = mapper.new Context(conf, id, rr, rw, oc, sr, split);
+    mapper.run(con);
+    
+    // Flush and close record writers.
+    rw.close(context);
+    
+  }
+  
+  private void debugQuery(String tableName) throws Exception {
+    Scanner s = c.createScanner(tableName, new Authorizations("all"));
+    Range r = new Range();
+    s.setRange(r);
+    for (Entry<Key,Value> entry : s)
+      System.out.println(entry.getKey().toString() + " " + entry.getValue().toString());
+  }
+  
+  @Test
+  public void testViewAllData() throws Exception {
+    debugQuery(METADATA_TABLE_NAME);
+    debugQuery(TABLE_NAME);
+    debugQuery(INDEX_TABLE_NAME);
+    debugQuery(RINDEX_TABLE_NAME);
+  }
+}

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapperTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/normalizer/testNumberNormalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/normalizer/testNumberNormalizer.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/normalizer/testNumberNormalizer.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/normalizer/testNumberNormalizer.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.normalizer;
+
+import static org.junit.Assert.assertTrue;
+
+import org.apache.accumulo.examples.wikisearch.normalizer.NumberNormalizer;
+import org.junit.Test;
+
+public class testNumberNormalizer {
+  
+  @Test
+  public void test1() throws Exception {
+    NumberNormalizer nn = new NumberNormalizer();
+    
+    String n1 = nn.normalizeFieldValue(null, "1");
+    String n2 = nn.normalizeFieldValue(null, "1.00000000");
+    
+    assertTrue(n1.compareTo(n2) < 0);
+    
+  }
+  
+  @Test
+  public void test2() {
+    NumberNormalizer nn = new NumberNormalizer();
+    
+    String n1 = nn.normalizeFieldValue(null, "-1.0");
+    String n2 = nn.normalizeFieldValue(null, "1.0");
+    
+    assertTrue(n1.compareTo(n2) < 0);
+    
+  }
+  
+  @Test
+  public void test3() {
+    NumberNormalizer nn = new NumberNormalizer();
+    String n1 = nn.normalizeFieldValue(null, "-0.0001");
+    String n2 = nn.normalizeFieldValue(null, "0");
+    String n3 = nn.normalizeFieldValue(null, "0.00001");
+    
+    assertTrue((n1.compareTo(n2) < 0) && (n2.compareTo(n3) < 0));
+  }
+  
+  @Test
+  public void test4() {
+    NumberNormalizer nn = new NumberNormalizer();
+    String nn1 = nn.normalizeFieldValue(null, Integer.toString(Integer.MAX_VALUE));
+    String nn2 = nn.normalizeFieldValue(null, Integer.toString(Integer.MAX_VALUE - 1));
+    
+    assertTrue((nn2.compareTo(nn1) < 0));
+    
+  }
+  
+  @Test
+  public void test5() {
+    NumberNormalizer nn = new NumberNormalizer();
+    String nn1 = nn.normalizeFieldValue(null, "-0.001");
+    String nn2 = nn.normalizeFieldValue(null, "-0.0009");
+    String nn3 = nn.normalizeFieldValue(null, "-0.00090");
+    
+    assertTrue((nn3.compareTo(nn2) == 0) && (nn2.compareTo(nn1) > 0));
+    
+  }
+  
+  @Test
+  public void test6() {
+    NumberNormalizer nn = new NumberNormalizer();
+    String nn1 = nn.normalizeFieldValue(null, "00.0");
+    String nn2 = nn.normalizeFieldValue(null, "0");
+    String nn3 = nn.normalizeFieldValue(null, "0.0");
+    
+    assertTrue((nn3.compareTo(nn2) == 0) && (nn2.compareTo(nn1) == 0));
+    
+  }
+  
+}

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/normalizer/testNumberNormalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReaderTest.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReaderTest.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReaderTest.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReaderTest.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,284 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.reader;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.StringReader;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.xpath.XPath;
+import javax.xml.xpath.XPathExpression;
+import javax.xml.xpath.XPathFactory;
+
+import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.TaskAttemptID;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.junit.Before;
+import org.junit.Test;
+import org.w3c.dom.Document;
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.SAXParseException;
+
+public class AggregatingRecordReaderTest {
+  
+  public static class MyErrorHandler implements ErrorHandler {
+    
+    @Override
+    public void error(SAXParseException exception) throws SAXException {
+      // System.out.println(exception.getMessage());
+    }
+    
+    @Override
+    public void fatalError(SAXParseException exception) throws SAXException {
+      // System.out.println(exception.getMessage());
+    }
+    
+    @Override
+    public void warning(SAXParseException exception) throws SAXException {
+      // System.out.println(exception.getMessage());
+    }
+    
+  }
+  
+  private static final String xml1 = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<doc>\n" + "  <a>A</a>\n" + "  <b>B</b>\n" + "</doc>\n"
+      + "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<doc>\n" + "  <a>C</a>\n" + "  <b>D</b>\n" + "</doc>\n" + "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
+      + "<doc>\n" + "  <a>E</a>\n" + "  <b>F</b>\n" + "</doc>\n";
+  
+  private static final String xml2 = "  <b>B</b>\n" + "</doc>\n" + "<doc>\n" + "  <a>C</a>\n" + "  <b>D</b>\n" + "</doc>\n" + "<doc>\n" + "  <a>E</a>\n"
+      + "  <b>F</b>\n" + "</doc>\n";
+  
+  private static final String xml3 = "<doc>\n" + "  <a>A</a>\n" + "  <b>B</b>\n" + "</doc>\n" + "<doc>\n" + "  <a>C</a>\n" + "  <b>D</b>\n" + "</doc>\n"
+      + "<doc>\n" + "  <a>E</a>\n";
+  
+  private static final String xml4 = "<doc>" + "  <a>A</a>" + "  <b>B</b>" + "</doc>" + "<doc>" + "  <a>C</a>" + "  <b>D</b>" + "</doc>" + "<doc>"
+      + "  <a>E</a>" + "  <b>F</b>" + "</doc>";
+  
+  private static final String xml5 = "<doc attr=\"G\">" + "  <a>A</a>" + "  <b>B</b>" + "</doc>" + "<doc>" + "  <a>C</a>" + "  <b>D</b>" + "</doc>"
+      + "<doc attr=\"H\"/>" + "<doc>" + "  <a>E</a>" + "  <b>F</b>" + "</doc>" + "<doc attr=\"I\"/>";
+  
+  private Configuration conf = null;
+  private TaskAttemptContext ctx = null;
+  private static DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
+  private XPathFactory xpFactory = XPathFactory.newInstance();
+  private XPathExpression EXPR_A = null;
+  private XPathExpression EXPR_B = null;
+  private XPathExpression EXPR_ATTR = null;
+  
+  @Before
+  public void setUp() throws Exception {
+    conf = new Configuration();
+    conf.set(AggregatingRecordReader.START_TOKEN, "<doc");
+    conf.set(AggregatingRecordReader.END_TOKEN, "</doc>");
+    conf.set(AggregatingRecordReader.RETURN_PARTIAL_MATCHES, Boolean.toString(true));
+    ctx = new TaskAttemptContext(conf, new TaskAttemptID());
+    XPath xp = xpFactory.newXPath();
+    EXPR_A = xp.compile("/doc/a");
+    EXPR_B = xp.compile("/doc/b");
+    EXPR_ATTR = xp.compile("/doc/@attr");
+  }
+  
+  public File createFile(String data) throws Exception {
+    // Write out test file
+    File f = File.createTempFile("aggReaderTest", ".xml");
+    f.deleteOnExit();
+    FileWriter writer = new FileWriter(f);
+    writer.write(data);
+    writer.flush();
+    writer.close();
+    return f;
+  }
+  
+  private void testXML(Text xml, String aValue, String bValue, String attrValue) throws Exception {
+    StringReader reader = new StringReader(xml.toString());
+    InputSource source = new InputSource(reader);
+    
+    DocumentBuilder parser = factory.newDocumentBuilder();
+    parser.setErrorHandler(new MyErrorHandler());
+    Document root = parser.parse(source);
+    assertNotNull(root);
+    
+    reader = new StringReader(xml.toString());
+    source = new InputSource(reader);
+    assertEquals(aValue, EXPR_A.evaluate(source));
+    
+    reader = new StringReader(xml.toString());
+    source = new InputSource(reader);
+    assertEquals(bValue, EXPR_B.evaluate(source));
+    
+    reader = new StringReader(xml.toString());
+    source = new InputSource(reader);
+    assertEquals(attrValue, EXPR_ATTR.evaluate(source));
+  }
+  
+  @Test
+  public void testIncorrectArgs() throws Exception {
+    File f = createFile(xml1);
+    
+    // Create FileSplit
+    Path p = new Path(f.toURI().toString());
+    FileSplit split = new FileSplit(p, 0, f.length(), null);
+    AggregatingRecordReader reader = new AggregatingRecordReader();
+    try {
+      // Clear the values for START_TOKEN and END_TOKEN
+      conf.set(AggregatingRecordReader.START_TOKEN, null);
+      conf.set(AggregatingRecordReader.END_TOKEN, null);
+      reader.initialize(split, ctx);
+      // If we got here, then the code didn't throw an exception
+      fail();
+    } catch (Exception e) {
+      // Expected: initialization must fail when the tokens are unset
+    }
+    reader.close();
+  }
+  
+  @Test
+  public void testCorrectXML() throws Exception {
+    File f = createFile(xml1);
+    
+    // Create FileSplit
+    Path p = new Path(f.toURI().toString());
+    FileSplit split = new FileSplit(p, 0, f.length(), null);
+    
+    // Initialize the RecordReader
+    AggregatingRecordReader reader = new AggregatingRecordReader();
+    reader.initialize(split, ctx);
+    assertTrue(reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "A", "B", "");
+    assertTrue(reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "C", "D", "");
+    assertTrue(reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "E", "F", "");
+    assertFalse(reader.nextKeyValue());
+  }
+  
+  @Test
+  public void testPartialXML() throws Exception {
+    File f = createFile(xml2);
+    
+    // Create FileSplit
+    Path p = new Path(f.toURI().toString());
+    FileSplit split = new FileSplit(p, 0, f.length(), null);
+    
+    // Initialize the RecordReader
+    AggregatingRecordReader reader = new AggregatingRecordReader();
+    reader.initialize(split, ctx);
+    assertTrue(reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "C", "D", "");
+    assertTrue(reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "E", "F", "");
+    assertFalse(reader.nextKeyValue());
+  }
+  
+  @Test
+  public void testPartialXML2WithNoPartialRecordsReturned() throws Exception {
+    conf.set(AggregatingRecordReader.RETURN_PARTIAL_MATCHES, Boolean.toString(false));
+    // Recreate the context: TaskAttemptContext copies the Configuration, so changes
+    // made after setUp() are not visible through the existing context.
+    ctx = new TaskAttemptContext(conf, new TaskAttemptID());
+    File f = createFile(xml3);
+    
+    // Create FileSplit
+    Path p = new Path(f.toURI().toString());
+    FileSplit split = new FileSplit(p, 0, f.length(), null);
+    
+    // Initialize the RecordReader
+    AggregatingRecordReader reader = new AggregatingRecordReader();
+    reader.initialize(split, ctx);
+    assertTrue(reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "A", "B", "");
+    assertTrue(reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "C", "D", "");
+    assertFalse(reader.nextKeyValue());
+  }
+  
+  @Test
+  public void testPartialXML2() throws Exception {
+    File f = createFile(xml3);
+    
+    // Create FileSplit
+    Path p = new Path(f.toURI().toString());
+    FileSplit split = new FileSplit(p, 0, f.length(), null);
+    
+    // Initialize the RecordReader
+    AggregatingRecordReader reader = new AggregatingRecordReader();
+    reader.initialize(split, ctx);
+    assertTrue(reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "A", "B", "");
+    assertTrue(reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "C", "D", "");
+    assertTrue(reader.nextKeyValue());
+    try {
+      testXML(reader.getCurrentValue(), "E", "", "");
+      fail("Fragment returned, and it somehow passed XML parsing.");
+    } catch (SAXParseException e) {
+      // ignore
+    }
+    assertFalse(reader.nextKeyValue());
+  }
+  
+  @Test
+  public void testLineSplitting() throws Exception {
+    File f = createFile(xml4);
+    
+    // Create FileSplit
+    Path p = new Path(f.toURI().toString());
+    FileSplit split = new FileSplit(p, 0, f.length(), null);
+    
+    // Initialize the RecordReader
+    AggregatingRecordReader reader = new AggregatingRecordReader();
+    reader.initialize(split, ctx);
+    assertTrue(reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "A", "B", "");
+    assertTrue(reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "C", "D", "");
+    assertTrue(reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "E", "F", "");
+    assertFalse(reader.nextKeyValue());
+  }
+  
+  @Test
+  public void testNoEndTokenHandling() throws Exception {
+    File f = createFile(xml5);
+    // Create FileSplit
+    Path p = new Path(f.toURI().toString());
+    FileSplit split = new FileSplit(p, 0, f.length(), null);
+    
+    // Initialize the RecordReader
+    AggregatingRecordReader reader = new AggregatingRecordReader();
+    reader.initialize(split, ctx);
+    assertTrue("Not enough records returned.", reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "A", "B", "G");
+    assertTrue("Not enough records returned.", reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "C", "D", "");
+    assertTrue("Not enough records returned.", reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "", "", "H");
+    assertTrue("Not enough records returned.", reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "E", "F", "");
+    assertTrue("Not enough records returned.", reader.nextKeyValue());
+    testXML(reader.getCurrentValue(), "", "", "I");
+    assertTrue("Too many records returned.", !reader.nextKeyValue());
+  }
+  
+}

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReaderTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

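For orientation, the tests above drive the reader exactly as a MapReduce task would: set the
start/end tokens on the Configuration, initialize with a FileSplit, then iterate
nextKeyValue()/getCurrentValue(). A minimal standalone sketch of that flow follows; the input
path and split length are illustrative placeholders, while the configuration keys and reader
calls are the ones exercised by the test.

    import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.TaskAttemptContext;
    import org.apache.hadoop.mapreduce.TaskAttemptID;
    import org.apache.hadoop.mapreduce.lib.input.FileSplit;

    public class AggregatingReaderSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Each record handed back is one complete <doc>...</doc> element.
        conf.set(AggregatingRecordReader.START_TOKEN, "<doc");
        conf.set(AggregatingRecordReader.END_TOKEN, "</doc>");
        conf.set(AggregatingRecordReader.RETURN_PARTIAL_MATCHES, Boolean.toString(true));

        Path input = new Path("file:///tmp/docs.xml"); // placeholder input file
        FileSplit split = new FileSplit(input, 0, 4096, null); // placeholder length

        AggregatingRecordReader reader = new AggregatingRecordReader();
        reader.initialize(split, new TaskAttemptContext(conf, new TaskAttemptID()));
        while (reader.nextKeyValue()) {
          Text record = reader.getCurrentValue(); // one aggregated XML fragment
          System.out.println(record);
        }
        reader.close();
      }
    }
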
Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/resources/enwiki-20110901-001.xml
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/resources/enwiki-20110901-001.xml?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/resources/enwiki-20110901-001.xml (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/resources/enwiki-20110901-001.xml Thu Jan 12 16:06:14 2012
@@ -0,0 +1,153 @@
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.5/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.5/ http://www.mediawiki.org/xml/export-0.5.xsd" version="0.5" xml:lang="en">
+  <siteinfo>
+    <sitename>Wikipedia</sitename>
+    <base>http://en.wikipedia.org/wiki/Main_Page</base>
+    <generator>MediaWiki 1.17wmf1</generator>
+    <case>first-letter</case>
+    <namespaces>
+      <namespace key="-2" case="first-letter">Media</namespace>
+      <namespace key="-1" case="first-letter">Special</namespace>
+      <namespace key="0" case="first-letter" />
+      <namespace key="1" case="first-letter">Talk</namespace>
+      <namespace key="2" case="first-letter">User</namespace>
+      <namespace key="3" case="first-letter">User talk</namespace>
+      <namespace key="4" case="first-letter">Wikipedia</namespace>
+      <namespace key="5" case="first-letter">Wikipedia talk</namespace>
+      <namespace key="6" case="first-letter">File</namespace>
+      <namespace key="7" case="first-letter">File talk</namespace>
+      <namespace key="8" case="first-letter">MediaWiki</namespace>
+      <namespace key="9" case="first-letter">MediaWiki talk</namespace>
+      <namespace key="10" case="first-letter">Template</namespace>
+      <namespace key="11" case="first-letter">Template talk</namespace>
+      <namespace key="12" case="first-letter">Help</namespace>
+      <namespace key="13" case="first-letter">Help talk</namespace>
+      <namespace key="14" case="first-letter">Category</namespace>
+      <namespace key="15" case="first-letter">Category talk</namespace>
+      <namespace key="100" case="first-letter">Portal</namespace>
+      <namespace key="101" case="first-letter">Portal talk</namespace>
+      <namespace key="108" case="first-letter">Book</namespace>
+      <namespace key="109" case="first-letter">Book talk</namespace>
+    </namespaces>
+  </siteinfo>
+  <page>
+    <title>Abacus</title>
+    <id>655</id>
+    <revision>
+      <id>34350</id>
+      <timestamp>2002-02-25T15:43:11Z</timestamp>
+      <contributor>
+        <ip>Conversion script</ip>
+      </contributor>
+      <minor />
+      <comment>Automated conversion</comment>
+      <text xml:space="preserve">1. An '''abacus''' is a counting frame, typically wooden with balls sliding on wires.  It was first used before the adoption of the ten-digit [[Arabic numerals | Arabic numeral]] system and is still widely used by small merchants in [[China]].  The Roman abacus contains seven long and seven shorter rods or bars, the former having four perforated beads running on them and the latter one.  The bar marked 1 indicates units, X tens, and so on up to millions.  The beads on the shorter bars denote fives,--five units, five tens, etc.  The rod O and corresponding short rod are for marking ounces; and the short quarter rods for fractions of an ounce. Computations are made with it by means of balls of bone or ivory running on slender bamboo rods, similar to the simpler board, fitted up with beads strung on wires, which has been employed in teaching the rudiments of arithmetic in English schools.
+
+The '''Suan'''4-'''Pan'''2 (&amp;#31639;&amp;#30436;) of the Chinese closely resembles the Roman abacus in its construction and use.  The Chinese abacus is usally around eight inches tall and it comes in various width depending on application, it usually has more than seven rods.  There are two beads on each rod in the upper deck and five beads each in the bottom.  The beads are usually round and made of hard wood.  The abacus can be reset to the starting position instantly by a quick jerk along the horizontal axis to spin all the beads away from the horizontal beam at the center.  The beads are counted by moving them up or down towards the beam. Chinese abacus does more than just counting.  Unlike the simple counting board used in elimentary schools, very efficient Suan4-Pan2 techniques were developed to do multiplication, division, addition, substraction, square root and cubic root at high speed.  The beads and rods were often lubricated to ensure speed. When all five beads in the lower deck are moved up, they are reset to the original position, and one bead in the top deck is moved down as a carry.  When both beads in the upper deck are moved down, they are reset and a bead on the adjacent rod on the left is moved up as a carry.  The result of the computation is read off from the beads clustered near the separator beam between the upper and lower deck.  In a sense, the abacus works as a 5-2-5-2-5-2... based number system in which carries and shiftings are similiar to the decimal number system.  Since each rod represents a digit in a decimal number, the computation capacity of the abacus is only limited by the number of rods on the abacus.  When a mathematician runs out of rods, he simply adds another abacus to the left of the row.  In theory, the abacus can be expanded infinitely.
+
+As recently as the late 1960s, abacus arithmetics were still being taught in school (e.g. in Hong Kong).  When hand held calculators became popular, nobody wanted to learn how to operate an abacus any more. In the early days of handheld calculators, news about abacus operators beating electronic calculator in arithmetics competitions in both speed and accuracy often appeared in the media.  The main reason being that early calculators were often plagued by rounding and overflow errors.  (Most handheld calculators can only handle 8 to 10 significant digits, the abacus is virtually limitless in precision.) Inexperienced operators might contribute to the loss too.  But when calculators' functionality improved, everyone knew that the abacus could never compute complex functions (e.g. trignometry) faster than a calculator.  The older generation (those who were born before the early 1950s) still used it for a while, but electronic calculators gradually displaced abacus in Hong Kong over the past four decades.  Abacus is hardly seen in Hong Kong nowadays.  However, abacuses are still being used in China and Japan.  The [[slide rule]]s also suffered a similar demise.
+
+The Suan4-Pan2 is closely tied to the [[[Chinese numerals|Chinese &quot;Hua1 Ma3&quot; numbering system]]].
+
+The Japanese eliminated one bead each from the upper and lower deck in each column of the Chinese abacus, because these beads are redundent.  That makes the Japanese '''soroban''' (&amp;#21313;&amp;#38706;&amp;#30436;) more like the Roman abacus.  The soroban is about 3 inches tall.  The beans on a soroban are usually double cone shape.
+
+Many sources also mentioned use of abacus in ancient Mayan culture.  
+The Mesoamerican abacus is closely tied to the base-20 [[Mayan numerals]] system.
+
+External Ref: 
+[[http://www.ee.ryerson.ca/~elf/abacus/ Abacus]], 
+[[http://www.soroban.com/ Soroban]], 
+[[http://www.sungwh.freeserve.co.uk/sapienti/abacus01.htm Suan Pan]], 
+[[http://hawk.hama-med.ac.jp/dbk/abacus.html Mesoamerican abacus]],
+[[http://www.dotpoint.com/xnumber/pic_roman_abacus.htm Roman abacus]]
+
+----
+
+2. (From the Greek ''abax'', a slab; or French ''abaque'', tailloir), in architecture, the upper member of the capital of a column.  Its chief function is to provide a larger supporting surface for the architrave or arch it has to carry.  In the Greek [[Doric]] order the abacus is a plain square slab.  In the Roman and Renaissance Doric orders it is crowned by a moulding.  In the Archaic-Greek [[Ionic]] order, owing to the greater width of the capital, the abacus is rectangular in plan, and consists of a carved [[ovolo]] moulding.  In later examples the abacus is square, except where there are angle [[volute]]s, when it is slightly curved over the same.  In the Roman and Renaissance Ionic capital, the abacus is square with a fillet On the top of an ogee moulding, but curved over angle volutes.  In the Greek [[Corinthian]] order the abacus is moulded, its sides are concave and its angles canted (except in one or two exceptional Greek capitals, where it is brought to a sharp angle); and the same shape is adopted in the Roman and Renaissance Corinthian and Composite capitals, in some cases with the ovolo moulding carved.  In Romanesque architecture the abacus is square with the lower edge splayed off and moulded or carved, and the same was retained in France during the medieval period; but in England, in Early English work, a circular deeply moulded abacus was introduced, which in the 14th and 15th centuries was transformed into an octagonal one.  The diminutive of abacus, [[abaciscus]], is applied in architecture to the chequers or squares of a tessellated pavement.
+
+----
+
+3. (possibly defunct) The name of abacus is also given, in [[logic]], to an instrument, often called the &quot;logical machine&quot;, analogous to the mathematical abacus.  It is constructed to show all the possible combinations of a set of logical terms with their negatives, and, further, the way in which these combinations are affected by the addition of attributes or other limiting words, i.e., to simplify mechanically the solution of logical problems.  These instruments are all more or less elaborate developments of the &quot;logical slate&quot;, on which were written in vertical columns all the combinations of symbols or letters which could be made logically out of a definite number of terms.  These were compared with any given premises, and those which were incompatible were crossed off.  In the abacus the combinations are inscribed each on a single slip of wood or similar substance, which is moved by a key; incompatible combinations can thus be mechanically removed at will, in accordance with any given series of premises.
+
+----
+
+see also:
+* [[slide rule]]
+
+[[talk:Abacus|Talk]]
+</text>
+    </revision>
+  </page>
+  <page>
+    <title>Acid</title>
+    <id>656</id>
+    <revision>
+      <id>46344</id>
+      <timestamp>2002-02-25T15:43:11Z</timestamp>
+      <contributor>
+        <ip>Conversion script</ip>
+      </contributor>
+      <minor />
+      <comment>Automated conversion</comment>
+      <text xml:space="preserve">An '''acid''' is a chemical generally defined by its reactions with complementary chemicals, designated [[base]]s. See [[Acid-base reaction theories]].
+
+Some of the stronger acids include the hydrohalic acids - HCl, HBr, and HI - and the oxyacids, which tend to contain central atoms in high oxidation states surrounded by oxygen - including HNO&lt;sub&gt;3&lt;/sub&gt; and H&lt;sub&gt;2&lt;/sub&gt;SO&lt;sub&gt;4&lt;/sub&gt;.
+
+
+Acidity is typically measured using the [[pH]] scale.
+
+----
+See also:
+
+&quot;Acid&quot; is also a slang word referring to [[LSD]].
+
+'''ACID''' is an acronym that expands to four essential properties of a [[database management system]].
+See [[ACID properties]].
+</text>
+    </revision>
+  </page>
+  <page>
+    <title>Asphalt</title>
+    <id>657</id>
+    <revision>
+      <id>29335</id>
+      <timestamp>2002-02-25T15:43:11Z</timestamp>
+      <contributor>
+        <ip>Conversion script</ip>
+      </contributor>
+      <minor />
+      <comment>Automated conversion</comment>
+      <text xml:space="preserve">'''Asphalt''' (also called [[bitumen]]) is a material that occurs naturally in most crude [[petroleum]]s. It is commonly used to build the surface of roads.
+</text>
+    </revision>
+  </page>
+  <page>
+    <title>Acronym</title>
+    <id>658</id>
+    <redirect />
+    <revision>
+      <id>60824</id>
+      <timestamp>2002-02-25T15:43:11Z</timestamp>
+      <contributor>
+        <ip>Conversion script</ip>
+      </contributor>
+      <minor />
+      <comment>Automated conversion</comment>
+      <text xml:space="preserve">An '''acronym''' is an [[abbreviation]], often composed of the initial letters of the words in a short phrase, that is treated as word (often, a piece of jargon or the proper name of an organization).  For example, SAM for [[''s''urface-to-''a''ir ''m''issile]] and [[NATO]] for the [[North Atlantic Treaty Organization]].  In its original meaning, acronyms were restricted to ''pronouncible'' abbreviations (what might be called ''true'' acronyms), though common usage permits calling unpronouncable abbreviations acronyms as well. Sometimes conjuntions and prepositions (such as and or to) contribute letters to make the acronym pronouncible, in contradiction to the normal [[English language|English]] rule for abbreviations.
+
+Often, an acronym will come into such wide use that people think of it as a word in itself, forget that it started out as an acronym, and write in in small letters. Examples include [[quasar]] (''q''uasi-''s''tellar ''r''adio ''s''ource), [[laser]] (''l''ight ''a''mplification by ''s''timulated ''e''mission of ''r''adiation) and radar (''r''adio ''d''etection ''a''nd ''r''anging).
+
+Non-pronouncible abbreviations formed from initials (such as IBM for International Business Machines) are sometimes called '''[[initialism]]s'''.
+
+Some lists of acronyms in use:
+
+*[[Internet slang|acronyms used on the Internet]]
+*[[Acronym/List|list of acronyms]]
+*[[Acronym/Medical List|list of medical acronyms]]
+
+A large list of acronyms may be found at http://www.acronymfinder.com/
+
+[[talk:Acronym|/Talk]]
+</text>
+    </revision>
+  </page>
+</mediawiki>

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/test/resources/enwiki-20110901-001.xml
------------------------------------------------------------------------------
    svn:eol-style = native

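The fixture above is a fragment of a MediaWiki 0.5 export: each article lives in its own
<page> element beneath a single <mediawiki> root. If the aggregating reader were configured
with START_TOKEN "<page" and END_TOKEN "</page>" (an assumption for illustration; the actual
ingest configuration is defined elsewhere in this commit), each record would be one complete
page, and fields could be pulled out with XPath in the same style as testXML() above. A sketch:

    import java.io.StringReader;

    import javax.xml.xpath.XPath;
    import javax.xml.xpath.XPathFactory;

    import org.apache.hadoop.io.Text;
    import org.xml.sax.InputSource;

    public class PageFieldSketch {
      // Extracts the title from one aggregated <page>...</page> fragment. Plain,
      // namespace-unaware XPath works here because the xmlns declaration sits on the
      // enclosing <mediawiki> element, not on the extracted <page> fragment.
      public static String title(Text page) throws Exception {
        XPath xp = XPathFactory.newInstance().newXPath();
        return xp.evaluate("/page/title", new InputSource(new StringReader(page.toString())));
      }
    }
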
Added: incubator/accumulo/trunk/src/examples/wikisearch/pom.xml
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/pom.xml?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/pom.xml (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/pom.xml Thu Jan 12 16:06:14 2012
@@ -0,0 +1,253 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements. See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License. You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+        <artifactId>accumulo-examples</artifactId>
+        <groupId>org.apache.accumulo</groupId>
+        <version>1.5.0-incubating-SNAPSHOT</version>
+        <relativePath>../</relativePath>
+  </parent>
+  <artifactId>accumulo-wikisearch</artifactId>
+  <packaging>pom</packaging>
+  <name>accumulo-wikisearch</name>
+
+  <modules>
+    <module>ingest</module>
+    <module>query</module>
+    <module>query-war</module>
+  </modules>
+
+  <repositories>
+    <repository>
+      <id>central</id>
+      <name>Maven Repository Switchboard</name>
+      <layout>default</layout>
+      <url>http://repo1.maven.org/maven2</url>
+      <snapshots>
+        <enabled>false</enabled>
+      </snapshots>
+    </repository>
+    <repository>
+      <id>java.net</id>
+      <name>java.net</name>
+      <layout>default</layout>
+      <url>https://maven.java.net/content/groups/public</url>
+      <snapshots>
+        <enabled>false</enabled>
+      </snapshots>
+    </repository>
+  </repositories>
+
+  <build>
+    <defaultGoal>package</defaultGoal>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-enforcer-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>enforce-mvn</id>
+            <goals>
+              <goal>enforce</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <artifactId>maven-clean-plugin</artifactId>
+        <configuration>
+          <filesets>
+            <fileset>
+              <directory>lib</directory>
+            </fileset>
+            <fileset>
+              <directory>target</directory>
+            </fileset>
+          </filesets>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-jar-plugin</artifactId>
+        <configuration>
+          <outputDirectory>lib</outputDirectory>
+          <archive>
+            <manifest>
+              <addDefaultImplementationEntries>true</addDefaultImplementationEntries>
+            </manifest>
+            <manifestEntries>
+              <Implementation-Build>${buildNumber}</Implementation-Build>
+              <Implementation-Timestamp>${timestamp}</Implementation-Timestamp>
+            </manifestEntries>
+          </archive>
+          <includes>
+            <include>**/**</include>
+          </includes>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-resources-plugin</artifactId>
+        <configuration>
+          <encoding>UTF-8</encoding>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-javadoc-plugin</artifactId>
+        <configuration>
+          <encoding>UTF-8</encoding>
+          <quiet>true</quiet>
+          <jarOutputDirectory>lib</jarOutputDirectory>
+          <reportOutputDirectory>docs</reportOutputDirectory>
+          <javadocVersion>1.6</javadocVersion>
+          <additionalJOption>-J-Xmx512m</additionalJOption>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-source-plugin</artifactId>
+        <configuration>
+          <outputDirectory>lib</outputDirectory>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-surefire-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <inherited>false</inherited>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>copy-dependencies</id>
+            <phase>process-resources</phase>
+            <goals>
+              <goal>copy-dependencies</goal>
+            </goals>
+            <configuration>
+              <outputDirectory>../../lib</outputDirectory>
+              <!-- just grab the non-provided runtime dependencies -->
+              <includeArtifactIds>commons-collections,commons-configuration,commons-io,commons-lang,jline,log4j,libthrift,commons-jci-core,commons-jci-fam,commons-logging,commons-logging-api,cloudtrace</includeArtifactIds>
+              <excludeGroupIds>accumulo</excludeGroupIds>
+              <excludeTransitive>true</excludeTransitive>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <version.commons-lang>2.4</version.commons-lang>
+    <version.accumulo>${project.version}</version.accumulo>
+    <version.commons-jexl>2.0.1</version.commons-jexl>
+    <version.commons-codec>1.5</version.commons-codec>
+    <version.ejb-spec-api>1.0.1.Final</version.ejb-spec-api>
+    <version.jaxrs>2.1.0.GA</version.jaxrs>
+    <version.kryo>1.04</version.kryo>
+    <version.log4j>1.2.16</version.log4j>
+    <version.log4j-extras>1.0</version.log4j-extras>
+    <version.lucene>3.0.2</version.lucene>
+    <version.lucene-analyzers>3.0.2</version.lucene-analyzers>
+    <version.lucene-wikipedia>3.0.2</version.lucene-wikipedia>
+    <version.protobuf>2.3.0</version.protobuf>
+    <version.googlecollections>1.0</version.googlecollections>
+    <version.libthrift>0.6.1</version.libthrift>
+    <version.zookeeper>3.3.1</version.zookeeper>
+    <version.minlog>1.2</version.minlog>
+  </properties>
+
+  <dependencyManagement>
+    <dependencies>
+      <dependency>
+        <groupId>commons-codec</groupId>
+        <artifactId>commons-codec</artifactId>
+        <version>${version.commons-codec}</version>
+      </dependency>
+      <dependency>
+        <groupId>commons-lang</groupId>
+        <artifactId>commons-lang</artifactId>
+        <version>${version.commons-lang}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.accumulo</groupId>
+        <artifactId>accumulo-core</artifactId>
+        <version>${version.accumulo}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.hadoop</groupId>
+        <artifactId>zookeeper</artifactId>
+        <version>${version.zookeeper}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.thrift</groupId>
+        <artifactId>libthrift</artifactId>
+        <version>${version.libthrift}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.accumulo</groupId>
+        <artifactId>cloudtrace</artifactId>
+        <version>${version.accumulo}</version>
+      </dependency>
+      <dependency>
+        <groupId>com.google.collections</groupId>
+        <artifactId>google-collections</artifactId>
+        <version>${version.googlecollections}</version>
+      </dependency>
+      <dependency>
+        <groupId>com.googlecode</groupId>
+        <artifactId>kryo</artifactId>
+        <version>${version.kryo}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.lucene</groupId>
+        <artifactId>lucene-core</artifactId>
+        <version>${version.lucene}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.lucene</groupId>
+        <artifactId>lucene-analyzers</artifactId>
+        <version>${version.lucene-analyzers}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.lucene</groupId>
+        <artifactId>lucene-wikipedia</artifactId>
+        <version>${version.lucene-wikipedia}</version>
+      </dependency>
+      <dependency>
+        <groupId>com.google.protobuf</groupId>
+        <artifactId>protobuf-java</artifactId>
+        <version>${version.protobuf}</version>
+      </dependency>
+      <dependency>
+        <groupId>org.apache.commons</groupId>
+        <artifactId>commons-jexl</artifactId>
+        <version>${version.commons-jexl}</version>
+      </dependency>
+      <dependency>
+        <groupId>com.googlecode</groupId>
+        <artifactId>minlog</artifactId>
+        <version>${version.minlog}</version>
+      </dependency>
+      <dependency>
+        <groupId>com.sun.jersey</groupId>
+        <artifactId>jersey-server</artifactId>
+        <version>1.11</version>
+      </dependency>
+    </dependencies>
+  </dependencyManagement>
+
+</project>

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/pom.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/query/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Thu Jan 12 16:06:14 2012
@@ -0,0 +1,3 @@
+.*
+target
+lib

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/query-war/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Thu Jan 12 16:06:14 2012
@@ -0,0 +1,2 @@
+.*
+target