You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@accumulo.apache.org by ec...@apache.org on 2012/01/06 23:02:13 UTC
svn commit: r1228459 [2/13] - in /incubator/accumulo/branches/1.4: ./ contrib/accumulo_sample/ src/examples/src/main/java/org/apache/accumulo/examples/wikisearch/ src/trace/ src/wikisearch/ src/wikisearch/ingest/ src/wikisearch/ingest/bin/ src/wikisear...

Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/iterator/TotalAggregatingIterator.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/iterator/TotalAggregatingIterator.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/iterator/TotalAggregatingIterator.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/iterator/TotalAggregatingIterator.java Fri Jan  6 22:02:09 2012
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.iterator;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.accumulo.core.data.ByteSequence;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.PartialKey;
+import org.apache.accumulo.core.data.Range;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.iterators.IteratorEnvironment;
+import org.apache.accumulo.core.iterators.OptionDescriber;
+import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
+import org.apache.accumulo.core.iterators.aggregation.Aggregator;
+import org.apache.accumulo.core.iterators.conf.ColumnToClassMapping;
+import org.apache.accumulo.start.classloader.AccumuloClassLoader;
+
+/**
+ * Aggregates all values that share the same key (row, column family, column qualifier and column visibility), regardless of timestamp, into a single
+ * entry by feeding them to one configured {@link Aggregator}.
+ * <p>
+ * The aggregator class is read from the iterator options in {@link #init(SortedKeyValueIterator, Map, IteratorEnvironment)}.
+ */
+public class TotalAggregatingIterator implements SortedKeyValueIterator<Key,Value>, OptionDescriber {
+  
+  /** Source iterator supplying the entries to aggregate. */
+  private SortedKeyValueIterator<Key,Value> iterator;
+  
+  /** Reusable key that receives a copy of the first key of each aggregated run. */
+  private Key workKey = new Key();
+  
+  /** Key of the pending aggregate, or {@code null} when no aggregate is pending. */
+  private Key aggrKey;
+  /** Aggregated value that belongs to {@link #aggrKey}. */
+  private Value aggrValue;
+  
+  /** Aggregator applied to every run of matching keys; created from the iterator options. */
+  private Aggregator agg;
+  
+  /**
+   * Creates a new iterator over a deep copy of the source iterator.
+   * 
+   * @param env
+   *          environment handed to the source's {@code deepCopy}
+   * @return the copy; it references the same {@link Aggregator} instance as this iterator
+   */
+  @Override
+  public TotalAggregatingIterator deepCopy(IteratorEnvironment env) {
+    return new TotalAggregatingIterator(this, env);
+  }
+  
+  // Copy constructor used by deepCopy().
+  // NOTE(review): the Aggregator instance is shared with the original, not cloned - confirm that
+  // copies are never advanced in an interleaved fashion.
+  private TotalAggregatingIterator(TotalAggregatingIterator other, IteratorEnvironment env) {
+    iterator = other.iterator.deepCopy(env);
+    agg = other.agg;
+  }
+  
+  /** Creates an unconfigured iterator; the source and aggregator are assigned in {@code init}. */
+  public TotalAggregatingIterator() {}
+  
+  /**
+   * Collects the source's current value and all following values whose key matches it on row, column family, column qualifier and column visibility, then
+   * stores the aggregate in {@link #aggrKey} / {@link #aggrValue}. The source is left positioned on the first entry that does not match.
+   */
+  private void aggregateRowColumn(Aggregator aggr) throws IOException {
+    // this function assumes that first value is not delete
+    
+    // Copy the key: the source's top key changes as soon as next() is called.
+    workKey.set(iterator.getTopKey());
+    
+    Key keyToAggregate = workKey;
+    
+    aggr.reset();
+    
+    aggr.collect(iterator.getTopValue());
+    iterator.next();
+    
+    while (iterator.hasTop() && iterator.getTopKey().equals(keyToAggregate, PartialKey.ROW_COLFAM_COLQUAL_COLVIS)) {
+      aggr.collect(iterator.getTopValue());
+      iterator.next();
+    }
+    
+    // The exposed key is the reusable workKey, so it is overwritten by the next aggregation.
+    aggrKey = workKey;
+    aggrValue = aggr.aggregate();
+    
+  }
+  
+  /** Builds the next aggregate if the source still has entries; otherwise leaves the state untouched. */
+  private void findTop() throws IOException {
+    // check if aggregation is needed
+    if (iterator.hasTop()) {
+      aggregateRowColumn(agg);
+    }
+  }
+  
+  /**
+   * Creates an iterator over the given source.
+   * 
+   * @param iterator
+   *          the source iterator
+   * @param aggregators
+   *          not used by this constructor; no aggregator is assigned here
+   */
+  public TotalAggregatingIterator(SortedKeyValueIterator<Key,Value> iterator, ColumnToClassMapping<Aggregator> aggregators) throws IOException {
+    this.iterator = iterator;
+  }
+  
+  /** Returns the key of the pending aggregate, or the source's top key when no aggregate is pending. */
+  @Override
+  public Key getTopKey() {
+    if (aggrKey != null) {
+      return aggrKey;
+    }
+    return iterator.getTopKey();
+  }
+  
+  /** Returns the pending aggregated value, or the source's top value when no aggregate is pending. */
+  @Override
+  public Value getTopValue() {
+    if (aggrKey != null) {
+      return aggrValue;
+    }
+    return iterator.getTopValue();
+  }
+  
+  /** Returns true when an aggregate is pending or the source still has a top entry. */
+  @Override
+  public boolean hasTop() {
+    return aggrKey != null || iterator.hasTop();
+  }
+  
+  /** Discards the pending aggregate (or advances the source when there is none) and builds the next aggregate. */
+  @Override
+  public void next() throws IOException {
+    if (aggrKey != null) {
+      // The source was already advanced past this run while aggregating it.
+      aggrKey = null;
+      aggrValue = null;
+    } else {
+      iterator.next();
+    }
+    
+    findTop();
+  }
+  
+  /**
+   * Seeks the source so that aggregation always starts at the first entry of a key, then skips aggregates that lie before the start of the requested
+   * range.
+   */
+  @Override
+  public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException {
+    // Drop any aggregate left over from a previous position; otherwise hasTop() would keep
+    // reporting it when the new range contains no entries.
+    aggrKey = null;
+    aggrValue = null;
+    
+    // do not want to seek to the middle of a value that should be
+    // aggregated...
+    
+    Range seekRange = maximizeStartKeyTimeStamp(range);
+    
+    iterator.seek(seekRange, columnFamilies, inclusive);
+    findTop();
+    
+    if (range.getStartKey() != null) {
+      while (hasTop() && getTopKey().equals(range.getStartKey(), PartialKey.ROW_COLFAM_COLQUAL_COLVIS)
+          && getTopKey().getTimestamp() > range.getStartKey().getTimestamp()) {
+        // the value has a more recent time stamp, so
+        // pass it up
+        next();
+      }
+      
+      while (hasTop() && range.beforeStartKey(getTopKey())) {
+        next();
+      }
+    }
+    
+  }
+  
+  /**
+   * Stores the source and instantiates the aggregator named in the options.
+   * 
+   * @throws IllegalArgumentException
+   *           if an aggregator class cannot be loaded or instantiated
+   */
+  @Override
+  public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
+    agg = createAggregator(options);
+    this.iterator = source;
+  }
+  
+  @Override
+  public IteratorOptions describeOptions() {
+    return new IteratorOptions("agg", "Aggregators apply aggregating functions to values with identical keys", null,
+        Collections.singletonList("* <aggregatorClass>"));
+  }
+  
+  /**
+   * Checks that at most one option is given and that its value names a loadable, instantiable {@link Aggregator}. As a side effect the created
+   * aggregator is stored in this instance.
+   * 
+   * @throws IllegalArgumentException
+   *           if more than one option is present or the aggregator cannot be created
+   */
+  @Override
+  public boolean validateOptions(Map<String,String> options) {
+    if (options.size() > 1)
+      throw new IllegalArgumentException("This iterator only accepts one configuration option, the name of the aggregating class");
+    agg = createAggregator(options);
+    return true;
+  }
+  
+  /**
+   * Instantiates the aggregator class named by each option value; the instance created for the last entry wins.
+   * 
+   * @return the aggregator, or {@code null} when the options are empty
+   * @throws IllegalArgumentException
+   *           if a class cannot be found, instantiated or accessed; the original exception is attached as the cause
+   */
+  private Aggregator createAggregator(Map<String,String> options) {
+    Aggregator a = null;
+    for (Entry<String,String> entry : options.entrySet()) {
+      try {
+        Class<? extends Aggregator> clazz = AccumuloClassLoader.loadClass(entry.getValue(), Aggregator.class);
+        a = clazz.newInstance();
+      } catch (ClassNotFoundException e) {
+        throw new IllegalArgumentException("class not found: " + entry.getValue(), e);
+      } catch (InstantiationException e) {
+        throw new IllegalArgumentException("instantiation exception: " + entry.getValue(), e);
+      } catch (IllegalAccessException e) {
+        throw new IllegalArgumentException("illegal access exception: " + entry.getValue(), e);
+      }
+    }
+    return a;
+  }
+  
+  /**
+   * Returns a range whose start key carries the maximum timestamp, so that a seek lands on the first entry of the start key's row/column instead of in the
+   * middle of it. The returned range always includes its start key; the given range is returned unchanged when it has no start key or the timestamp is
+   * already {@link Long#MAX_VALUE}.
+   */
+  static Range maximizeStartKeyTimeStamp(Range range) {
+    Range seekRange = range;
+    
+    if (range.getStartKey() != null && range.getStartKey().getTimestamp() != Long.MAX_VALUE) {
+      Key seekKey = new Key(seekRange.getStartKey());
+      seekKey.setTimestamp(Long.MAX_VALUE);
+      seekRange = new Range(seekKey, true, range.getEndKey(), range.isEndKeyInclusive());
+    }
+    
+    return seekRange;
+  }
+}

Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/iterator/TotalAggregatingIterator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/LcNoDiacriticsNormalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/LcNoDiacriticsNormalizer.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/LcNoDiacriticsNormalizer.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/LcNoDiacriticsNormalizer.java Fri Jan  6 22:02:09 2012
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.normalizer;
+
+import java.text.Normalizer;
+import java.text.Normalizer.Form;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A {@link org.apache.accumulo.wikisearch.normalizer.Normalizer} that produces lower-case text without diacritical marks by performing these steps:
+ * <ol>
+ * <li>Unicode canonical decomposition ({@link Form#NFD})</li>
+ * <li>Removal of every character in the Combining Diacritical Marks block</li>
+ * <li>Unicode canonical composition ({@link Form#NFC})</li>
+ * <li>Lower casing in the {@link Locale#ENGLISH English locale}</li>
+ * </ol>
+ */
+public class LcNoDiacriticsNormalizer implements org.apache.accumulo.wikisearch.normalizer.Normalizer {
+  /** Matches a single combining diacritical mark. */
+  private static final Pattern COMBINING_MARKS = Pattern.compile("\\p{InCombiningDiacriticalMarks}");
+  
+  /**
+   * Normalizes the string form of the given value.
+   * 
+   * @param fieldName
+   *          name of the field; not used by this normalizer
+   * @param fieldValue
+   *          value to normalize; its {@code toString()} result is processed
+   * @return the value without diacritical marks, in lower case
+   */
+  public String normalizeFieldValue(String fieldName, Object fieldValue) {
+    String text = Normalizer.normalize(fieldValue.toString(), Form.NFD);
+    // After decomposition the marks are separate characters and can simply be deleted.
+    text = COMBINING_MARKS.matcher(text).replaceAll("");
+    text = Normalizer.normalize(text, Form.NFC);
+    return text.toLowerCase(Locale.ENGLISH);
+  }
+  
+}

Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/LcNoDiacriticsNormalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NoOpNormalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NoOpNormalizer.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NoOpNormalizer.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NoOpNormalizer.java Fri Jan  6 22:02:09 2012
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.normalizer;
+
+/**
+ * A {@link Normalizer} that performs no normalization beyond converting the value to a string.
+ */
+public class NoOpNormalizer implements Normalizer {
+  /**
+   * Returns the value's string form unchanged.
+   * 
+   * @param field
+   *          name of the field; not used
+   * @param value
+   *          the value whose {@code toString()} result is returned
+   * @return {@code value.toString()}
+   */
+  public String normalizeFieldValue(String field, Object value) {
+    final String unchanged = value.toString();
+    return unchanged;
+  }
+}

Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NoOpNormalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/Normalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/Normalizer.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/Normalizer.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/Normalizer.java Fri Jan  6 22:02:09 2012
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.normalizer;
+
+/**
+ * Strategy for turning a field value into the normalized string used at ingest time.
+ */
+public interface Normalizer {
+  
+  /**
+   * Produces the normalized form of a field value according to the implementation's rules.
+   * 
+   * @param field
+   *          name of the field the value belongs to
+   * @param value
+   *          the value to be normalized
+   * @return the normalized value
+   */
+  String normalizeFieldValue(String field, Object value);
+  
+}

Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/Normalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NumberNormalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NumberNormalizer.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NumberNormalizer.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NumberNormalizer.java Fri Jan  6 22:02:09 2012
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.normalizer;
+
+import org.apache.commons.lang.math.NumberUtils;
+import org.apache.lucene.util.NumericUtils;
+
+/**
+ * A {@link Normalizer} that parses the value as a number and encodes it with the prefix coding of Lucene's {@link NumericUtils}.
+ */
+public class NumberNormalizer implements Normalizer {
+  
+  /**
+   * Encodes the numeric value using the prefix coding that matches its parsed type (int, long, float or double).
+   * 
+   * @param field
+   *          name of the field; not used
+   * @param value
+   *          value whose string form is parsed as a number
+   * @return the prefix-coded number
+   * @throws IllegalArgumentException
+   *           if the string form is not a number, or parses to a type other than Integer, Long, Float or Double
+   */
+  public String normalizeFieldValue(String field, Object value) {
+    final String text = value.toString();
+    if (!NumberUtils.isNumber(text)) {
+      throw new IllegalArgumentException("Value is not a number: " + value);
+    }
+    
+    final Number parsed = NumberUtils.createNumber(text);
+    if (parsed instanceof Integer) {
+      return NumericUtils.intToPrefixCoded(parsed.intValue());
+    }
+    if (parsed instanceof Long) {
+      return NumericUtils.longToPrefixCoded(parsed.longValue());
+    }
+    if (parsed instanceof Float) {
+      return NumericUtils.floatToPrefixCoded(parsed.floatValue());
+    }
+    if (parsed instanceof Double) {
+      return NumericUtils.doubleToPrefixCoded(parsed.doubleValue());
+    }
+    throw new IllegalArgumentException("Unhandled numeric type: " + parsed.getClass());
+  }
+  
+}

Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NumberNormalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/TermWeight.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/TermWeight.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/TermWeight.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/TermWeight.java Fri Jan  6 22:02:09 2012
@@ -0,0 +1,424 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: TermWeight.proto
+
+package org.apache.accumulo.wikisearch.protobuf;
+
+public final class TermWeight {
+  // Private constructor: this class cannot be instantiated from outside.
+  private TermWeight() {}
+  
+  // Empty body: nothing is added to the given registry.
+  public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {}
+  
+  public static final class Info extends com.google.protobuf.GeneratedMessage {
+    // Use Info.newBuilder() to construct.
+    private Info() {
+      initFields();
+    }
+    
+    private Info(boolean noInit) {}
+    
+    private static final Info defaultInstance;
+    
+    public static Info getDefaultInstance() {
+      return defaultInstance;
+    }
+    
+    public Info getDefaultInstanceForType() {
+      return defaultInstance;
+    }
+    
+    public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
+      return org.apache.accumulo.wikisearch.protobuf.TermWeight.internal_static_protobuf_Info_descriptor;
+    }
+    
+    protected com.google.protobuf.GeneratedMessage.FieldAccessorTable internalGetFieldAccessorTable() {
+      return org.apache.accumulo.wikisearch.protobuf.TermWeight.internal_static_protobuf_Info_fieldAccessorTable;
+    }
+    
+    // required float normalizedTermFrequency = 1;
+    public static final int NORMALIZEDTERMFREQUENCY_FIELD_NUMBER = 1;
+    private boolean hasNormalizedTermFrequency;
+    private float normalizedTermFrequency_ = 0F;
+    
+    public boolean hasNormalizedTermFrequency() {
+      return hasNormalizedTermFrequency;
+    }
+    
+    public float getNormalizedTermFrequency() {
+      return normalizedTermFrequency_;
+    }
+    
+    // repeated uint32 wordOffset = 2;
+    public static final int WORDOFFSET_FIELD_NUMBER = 2;
+    private java.util.List<java.lang.Integer> wordOffset_ = java.util.Collections.emptyList();
+    
+    public java.util.List<java.lang.Integer> getWordOffsetList() {
+      return wordOffset_;
+    }
+    
+    public int getWordOffsetCount() {
+      return wordOffset_.size();
+    }
+    
+    public int getWordOffset(int index) {
+      return wordOffset_.get(index);
+    }
+    
+    private void initFields() {}
+    
+    public final boolean isInitialized() {
+      if (!hasNormalizedTermFrequency)
+        return false;
+      return true;
+    }
+    
+    public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException {
+      getSerializedSize();
+      if (hasNormalizedTermFrequency()) {
+        output.writeFloat(1, getNormalizedTermFrequency());
+      }
+      for (int element : getWordOffsetList()) {
+        output.writeUInt32(2, element);
+      }
+      getUnknownFields().writeTo(output);
+    }
+    
+    private int memoizedSerializedSize = -1;
+    
+    public int getSerializedSize() {
+      int size = memoizedSerializedSize;
+      if (size != -1)
+        return size;
+      
+      size = 0;
+      if (hasNormalizedTermFrequency()) {
+        size += com.google.protobuf.CodedOutputStream.computeFloatSize(1, getNormalizedTermFrequency());
+      }
+      {
+        int dataSize = 0;
+        for (int element : getWordOffsetList()) {
+          dataSize += com.google.protobuf.CodedOutputStream.computeUInt32SizeNoTag(element);
+        }
+        size += dataSize;
+        size += 1 * getWordOffsetList().size();
+      }
+      size += getUnknownFields().getSerializedSize();
+      memoizedSerializedSize = size;
+      return size;
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data).buildParsed();
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.ByteString data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data).buildParsed();
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(java.io.InputStream input) throws java.io.IOException {
+      return newBuilder().mergeFrom(input).buildParsed();
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws java.io.IOException {
+      return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseDelimitedFrom(java.io.InputStream input) throws java.io.IOException {
+      Builder builder = newBuilder();
+      if (builder.mergeDelimitedFrom(input)) {
+        return builder.buildParsed();
+      } else {
+        return null;
+      }
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseDelimitedFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws java.io.IOException {
+      Builder builder = newBuilder();
+      if (builder.mergeDelimitedFrom(input, extensionRegistry)) {
+        return builder.buildParsed();
+      } else {
+        return null;
+      }
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.CodedInputStream input) throws java.io.IOException {
+      return newBuilder().mergeFrom(input).buildParsed();
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws java.io.IOException {
+      return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+    }
+    
+    public static Builder newBuilder() {
+      return Builder.create();
+    }
+    
+    public Builder newBuilderForType() {
+      return newBuilder();
+    }
+    
+    public static Builder newBuilder(org.apache.accumulo.wikisearch.protobuf.TermWeight.Info prototype) {
+      return newBuilder().mergeFrom(prototype);
+    }
+    
+    public Builder toBuilder() {
+      return newBuilder(this);
+    }
+    
+    public static final class Builder extends com.google.protobuf.GeneratedMessage.Builder<Builder> {
+      private org.apache.accumulo.wikisearch.protobuf.TermWeight.Info result;
+      
+      // Construct using protobuf.TermWeight.Info.newBuilder()
+      private Builder() {}
+      
+      private static Builder create() {
+        Builder builder = new Builder();
+        builder.result = new org.apache.accumulo.wikisearch.protobuf.TermWeight.Info();
+        return builder;
+      }
+      
+      protected org.apache.accumulo.wikisearch.protobuf.TermWeight.Info internalGetResult() {
+        return result;
+      }
+      
+      public Builder clear() {
+        if (result == null) {
+          throw new IllegalStateException("Cannot call clear() after build().");
+        }
+        result = new org.apache.accumulo.wikisearch.protobuf.TermWeight.Info();
+        return this;
+      }
+      
+      public Builder clone() {
+        return create().mergeFrom(result);
+      }
+      
+      public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() {
+        return org.apache.accumulo.wikisearch.protobuf.TermWeight.Info.getDescriptor();
+      }
+      
+      public org.apache.accumulo.wikisearch.protobuf.TermWeight.Info getDefaultInstanceForType() {
+        return org.apache.accumulo.wikisearch.protobuf.TermWeight.Info.getDefaultInstance();
+      }
+      
+      public boolean isInitialized() {
+        return result.isInitialized();
+      }
+      
+      public org.apache.accumulo.wikisearch.protobuf.TermWeight.Info build() {
+        if (result != null && !isInitialized()) {
+          throw newUninitializedMessageException(result);
+        }
+        return buildPartial();
+      }
+      
+      private org.apache.accumulo.wikisearch.protobuf.TermWeight.Info buildParsed() throws com.google.protobuf.InvalidProtocolBufferException {
+        if (!isInitialized()) {
+          throw newUninitializedMessageException(result).asInvalidProtocolBufferException();
+        }
+        return buildPartial();
+      }
+      
+      public org.apache.accumulo.wikisearch.protobuf.TermWeight.Info buildPartial() {
+        if (result == null) {
+          throw new IllegalStateException("build() has already been called on this Builder.");
+        }
+        if (result.wordOffset_ != java.util.Collections.EMPTY_LIST) {
+          result.wordOffset_ = java.util.Collections.unmodifiableList(result.wordOffset_);
+        }
+        org.apache.accumulo.wikisearch.protobuf.TermWeight.Info returnMe = result;
+        result = null;
+        return returnMe;
+      }
+      
+      public Builder mergeFrom(com.google.protobuf.Message other) {
+        if (other instanceof org.apache.accumulo.wikisearch.protobuf.TermWeight.Info) {
+          return mergeFrom((org.apache.accumulo.wikisearch.protobuf.TermWeight.Info) other);
+        } else {
+          super.mergeFrom(other);
+          return this;
+        }
+      }
+      
+      public Builder mergeFrom(org.apache.accumulo.wikisearch.protobuf.TermWeight.Info other) {
+        if (other == org.apache.accumulo.wikisearch.protobuf.TermWeight.Info.getDefaultInstance())
+          return this;
+        if (other.hasNormalizedTermFrequency()) {
+          setNormalizedTermFrequency(other.getNormalizedTermFrequency());
+        }
+        if (!other.wordOffset_.isEmpty()) {
+          if (result.wordOffset_.isEmpty()) {
+            result.wordOffset_ = new java.util.ArrayList<java.lang.Integer>();
+          }
+          result.wordOffset_.addAll(other.wordOffset_);
+        }
+        this.mergeUnknownFields(other.getUnknownFields());
+        return this;
+      }
+      
+      public Builder mergeFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+          throws java.io.IOException {
+        com.google.protobuf.UnknownFieldSet.Builder unknownFields = com.google.protobuf.UnknownFieldSet.newBuilder(this.getUnknownFields());
+        while (true) {
+          int tag = input.readTag();
+          switch (tag) {
+            case 0:
+              this.setUnknownFields(unknownFields.build());
+              return this;
+            default: {
+              if (!parseUnknownField(input, unknownFields, extensionRegistry, tag)) {
+                this.setUnknownFields(unknownFields.build());
+                return this;
+              }
+              break;
+            }
+            case 13: {
+              setNormalizedTermFrequency(input.readFloat());
+              break;
+            }
+            case 16: {
+              addWordOffset(input.readUInt32());
+              break;
+            }
+            case 18: {
+              int length = input.readRawVarint32();
+              int limit = input.pushLimit(length);
+              while (input.getBytesUntilLimit() > 0) {
+                addWordOffset(input.readUInt32());
+              }
+              input.popLimit(limit);
+              break;
+            }
+          }
+        }
+      }
+      
+      // required float normalizedTermFrequency = 1;
+      public boolean hasNormalizedTermFrequency() {
+        return result.hasNormalizedTermFrequency();
+      }
+      
+      public float getNormalizedTermFrequency() {
+        return result.getNormalizedTermFrequency();
+      }
+      
+      /**
+       * Stores normalizedTermFrequency on the message under construction and marks the field as present.
+       *
+       * @param value
+       *          the frequency to store
+       * @return this builder, for chaining
+       */
+      public Builder setNormalizedTermFrequency(float value) {
+        // The two writes are independent; store the value, then raise the presence flag.
+        result.normalizedTermFrequency_ = value;
+        result.hasNormalizedTermFrequency = true;
+        return this;
+      }
+      
+      /**
+       * Resets normalizedTermFrequency to zero and marks the field as not present.
+       *
+       * @return this builder, for chaining
+       */
+      public Builder clearNormalizedTermFrequency() {
+        result.normalizedTermFrequency_ = 0.0f;
+        result.hasNormalizedTermFrequency = false;
+        return this;
+      }
+      
+      // repeated uint32 wordOffset = 2;
+      /**
+       * Exposes the wordOffset values collected so far.
+       *
+       * @return an unmodifiable wrapper around the message's current wordOffset list
+       */
+      public java.util.List<java.lang.Integer> getWordOffsetList() {
+        final java.util.List<java.lang.Integer> readOnlyView = java.util.Collections.unmodifiableList(result.wordOffset_);
+        return readOnlyView;
+      }
+      
+      /**
+       * Reports how many wordOffset values the message under construction holds.
+       *
+       * @return the value of the message's own getWordOffsetCount()
+       */
+      public int getWordOffsetCount() {
+        final int count = result.getWordOffsetCount();
+        return count;
+      }
+      
+      /**
+       * Reads one wordOffset value from the message under construction.
+       *
+       * @param index
+       *          position of the element to read
+       * @return the value of the message's own getWordOffset(index)
+       */
+      public int getWordOffset(int index) {
+        final int offset = result.getWordOffset(index);
+        return offset;
+      }
+      
+      /**
+       * Overwrites the wordOffset element at the given position.
+       *
+       * @param index
+       *          position of the element to replace
+       * @param value
+       *          the new offset
+       * @return this builder, for chaining
+       */
+      public Builder setWordOffset(int index, int value) {
+        // Box explicitly; this is the same conversion autoboxing performs.
+        final java.lang.Integer boxed = java.lang.Integer.valueOf(value);
+        result.wordOffset_.set(index, boxed);
+        return this;
+      }
+      
+      /**
+       * Appends one wordOffset value to the message under construction.
+       *
+       * @param value
+       *          the offset to append
+       * @return this builder, for chaining
+       */
+      public Builder addWordOffset(int value) {
+        java.util.List<java.lang.Integer> offsets = result.wordOffset_;
+        if (offsets.isEmpty()) {
+          // An empty list is always swapped for a fresh ArrayList before the first append.
+          offsets = new java.util.ArrayList<java.lang.Integer>();
+          result.wordOffset_ = offsets;
+        }
+        offsets.add(value);
+        return this;
+      }
+      
+      /**
+       * Appends every supplied wordOffset value to the message under construction.
+       *
+       * @param values
+       *          the offsets to append, in iteration order
+       * @return this builder, for chaining
+       */
+      public Builder addAllWordOffset(java.lang.Iterable<? extends java.lang.Integer> values) {
+        java.util.List<java.lang.Integer> target = result.wordOffset_;
+        if (target.isEmpty()) {
+          // An empty list is always swapped for a fresh ArrayList before appending.
+          target = new java.util.ArrayList<java.lang.Integer>();
+          result.wordOffset_ = target;
+        }
+        super.addAll(values, target);
+        return this;
+      }
+      
+      /**
+       * Drops all wordOffset values by pointing the field at the shared empty list.
+       *
+       * @return this builder, for chaining
+       */
+      public Builder clearWordOffset() {
+        result.wordOffset_ = java.util.Collections.<java.lang.Integer> emptyList();
+        return this;
+      }
+      
+      // @@protoc_insertion_point(builder_scope:protobuf.Info)
+    }
+    
+    static { // creates the shared default instance; statement order matters
+      defaultInstance = new Info(true); // uses the boolean-argument constructor
+      org.apache.accumulo.wikisearch.protobuf.TermWeight.internalForceInit(); // static call into the outer class, so TermWeight's static initializer has run before continuing
+      defaultInstance.initFields();
+    }
+    
+    // @@protoc_insertion_point(class_scope:protobuf.Info)
+  }
+  
+  private static com.google.protobuf.Descriptors.Descriptor internal_static_protobuf_Info_descriptor;
+  private static com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_protobuf_Info_fieldAccessorTable;
+  
+  /**
+   * Returns the file descriptor stored by this class's static initializer.
+   *
+   * @return the current value of the static descriptor field
+   */
+  public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
+    final com.google.protobuf.Descriptors.FileDescriptor fileDescriptor = descriptor;
+    return fileDescriptor;
+  }
+  
+  private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
+  static { // builds the file descriptor and the reflection table for Info when TermWeight is initialized
+    java.lang.String[] descriptorData = {"\n\020TermWeight.proto\022\010protobuf\";\n\004Info\022\037\n\027" // escaped descriptor bytes for TermWeight.proto; must not be edited by hand
+        + "normalizedTermFrequency\030\001 \002(\002\022\022\n\nwordOff" + "set\030\002 \003(\rB\014\n\010protobufH\001"};
+    com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() {
+      public com.google.protobuf.ExtensionRegistry assignDescriptors(com.google.protobuf.Descriptors.FileDescriptor root) { // callback that receives the built descriptor
+        descriptor = root; // must be stored first: getDescriptor() on the next line reads this field
+        internal_static_protobuf_Info_descriptor = getDescriptor().getMessageTypes().get(0); // Info is message type 0 of this file
+        internal_static_protobuf_Info_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable(
+            internal_static_protobuf_Info_descriptor, new java.lang.String[] {"NormalizedTermFrequency", "WordOffset",}, org.apache.accumulo.wikisearch.protobuf.TermWeight.Info.class, // names presumably matched against the generated accessor method names; confirm against the protobuf runtime
+            org.apache.accumulo.wikisearch.protobuf.TermWeight.Info.Builder.class);
+        return null; // no extension registry is supplied
+      }
+    };
+    com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[] {}, // empty array: no dependency descriptors are passed
+        assigner);
+  }
+  
+  public static void internalForceInit() {} // intentionally empty: Info's static initializer calls it so that this outer class is initialized first
+  
+  // @@protoc_insertion_point(outer_class_scope)
+}

Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/TermWeight.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/Uid.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/Uid.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/Uid.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/Uid.java Fri Jan  6 22:02:09 2012
@@ -0,0 +1,470 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: Uid.proto
+
+package org.apache.accumulo.wikisearch.protobuf;
+
+public final class Uid { // generated holder for the message type declared in Uid.proto; never instantiated (private constructor)
+  private Uid() {}
+  
+  public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {} // empty body: nothing is registered
+  
+  public static final class List extends com.google.protobuf.GeneratedMessage { // message with fields IGNORE, COUNT and repeated UID; the name shadows java.util.List, which is why that type is always fully qualified below
+    // Use List.newBuilder() to construct.
+    private List() {
+      initFields();
+    }
+    
+    private List(boolean noInit) {} // used by the static initializer below to create defaultInstance
+    
+    private static final List defaultInstance;
+    
+    public static List getDefaultInstance() {
+      return defaultInstance;
+    }
+    
+    public List getDefaultInstanceForType() {
+      return defaultInstance;
+    }
+    
+    public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
+      return org.apache.accumulo.wikisearch.protobuf.Uid.internal_static_protobuf_List_descriptor;
+    }
+    
+    protected com.google.protobuf.GeneratedMessage.FieldAccessorTable internalGetFieldAccessorTable() {
+      return org.apache.accumulo.wikisearch.protobuf.Uid.internal_static_protobuf_List_fieldAccessorTable;
+    }
+    
+    // required bool IGNORE = 1;
+    public static final int IGNORE_FIELD_NUMBER = 1;
+    private boolean hasIGNORE;
+    private boolean iGNORE_ = false;
+    
+    public boolean hasIGNORE() {
+      return hasIGNORE;
+    }
+    
+    public boolean getIGNORE() {
+      return iGNORE_;
+    }
+    
+    // required uint64 COUNT = 2;
+    public static final int COUNT_FIELD_NUMBER = 2;
+    private boolean hasCOUNT;
+    private long cOUNT_ = 0L;
+    
+    public boolean hasCOUNT() {
+      return hasCOUNT;
+    }
+    
+    public long getCOUNT() {
+      return cOUNT_;
+    }
+    
+    // repeated string UID = 3;
+    public static final int UID_FIELD_NUMBER = 3;
+    private java.util.List<java.lang.String> uID_ = java.util.Collections.emptyList(); // starts as the shared immutable empty list; the Builder swaps in an ArrayList before adding
+    
+    public java.util.List<java.lang.String> getUIDList() {
+      return uID_;
+    }
+    
+    public int getUIDCount() {
+      return uID_.size();
+    }
+    
+    public java.lang.String getUID(int index) {
+      return uID_.get(index);
+    }
+    
+    private void initFields() {} // empty body: no field needs initialization here
+    
+    public final boolean isInitialized() { // true only when both required fields, IGNORE and COUNT, have been set
+      if (!hasIGNORE)
+        return false;
+      if (!hasCOUNT)
+        return false;
+      return true;
+    }
+    
+    public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException { // writes set fields in field-number order, then the unknown fields
+      getSerializedSize(); // return value unused; the call fills memoizedSerializedSize
+      if (hasIGNORE()) {
+        output.writeBool(1, getIGNORE());
+      }
+      if (hasCOUNT()) {
+        output.writeUInt64(2, getCOUNT());
+      }
+      for (java.lang.String element : getUIDList()) {
+        output.writeString(3, element);
+      }
+      getUnknownFields().writeTo(output);
+    }
+    
+    private int memoizedSerializedSize = -1; // -1 means the size has not been computed yet
+    
+    public int getSerializedSize() { // computes the serialized byte size once and caches it
+      int size = memoizedSerializedSize;
+      if (size != -1)
+        return size;
+      
+      size = 0;
+      if (hasIGNORE()) {
+        size += com.google.protobuf.CodedOutputStream.computeBoolSize(1, getIGNORE());
+      }
+      if (hasCOUNT()) {
+        size += com.google.protobuf.CodedOutputStream.computeUInt64Size(2, getCOUNT());
+      }
+      {
+        int dataSize = 0;
+        for (java.lang.String element : getUIDList()) {
+          dataSize += com.google.protobuf.CodedOutputStream.computeStringSizeNoTag(element);
+        }
+        size += dataSize;
+        size += 1 * getUIDList().size(); // plus one tag byte per UID element
+      }
+      size += getUnknownFields().getSerializedSize();
+      memoizedSerializedSize = size;
+      return size;
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException { // every parseFrom overload: new builder -> mergeFrom -> buildParsed
+      return newBuilder().mergeFrom(data).buildParsed();
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.ByteString data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data).buildParsed();
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(java.io.InputStream input) throws java.io.IOException {
+      return newBuilder().mergeFrom(input).buildParsed();
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws java.io.IOException {
+      return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseDelimitedFrom(java.io.InputStream input) throws java.io.IOException {
+      Builder builder = newBuilder();
+      if (builder.mergeDelimitedFrom(input)) {
+        return builder.buildParsed();
+      } else {
+        return null; // mergeDelimitedFrom returned false; callers must handle a null result
+      }
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseDelimitedFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws java.io.IOException {
+      Builder builder = newBuilder();
+      if (builder.mergeDelimitedFrom(input, extensionRegistry)) {
+        return builder.buildParsed();
+      } else {
+        return null; // mergeDelimitedFrom returned false; callers must handle a null result
+      }
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.CodedInputStream input) throws java.io.IOException {
+      return newBuilder().mergeFrom(input).buildParsed();
+    }
+    
+    public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws java.io.IOException {
+      return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+    }
+    
+    public static Builder newBuilder() {
+      return Builder.create();
+    }
+    
+    public Builder newBuilderForType() {
+      return newBuilder();
+    }
+    
+    public static Builder newBuilder(org.apache.accumulo.wikisearch.protobuf.Uid.List prototype) {
+      return newBuilder().mergeFrom(prototype);
+    }
+    
+    public Builder toBuilder() {
+      return newBuilder(this);
+    }
+    
+    public static final class Builder extends com.google.protobuf.GeneratedMessage.Builder<Builder> { // mutable builder that writes directly into a private List instance
+      private org.apache.accumulo.wikisearch.protobuf.Uid.List result; // message under construction; buildPartial() sets it to null
+      
+      // Construct using protobuf.Uid.List.newBuilder()
+      private Builder() {}
+      
+      private static Builder create() {
+        Builder builder = new Builder();
+        builder.result = new org.apache.accumulo.wikisearch.protobuf.Uid.List();
+        return builder;
+      }
+      
+      protected org.apache.accumulo.wikisearch.protobuf.Uid.List internalGetResult() {
+        return result;
+      }
+      
+      public Builder clear() { // replaces the message under construction with a fresh one; rejected once the builder has been built
+        if (result == null) {
+          throw new IllegalStateException("Cannot call clear() after build().");
+        }
+        result = new org.apache.accumulo.wikisearch.protobuf.Uid.List();
+        return this;
+      }
+      
+      public Builder clone() {
+        return create().mergeFrom(result);
+      }
+      
+      public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() {
+        return org.apache.accumulo.wikisearch.protobuf.Uid.List.getDescriptor();
+      }
+      
+      public org.apache.accumulo.wikisearch.protobuf.Uid.List getDefaultInstanceForType() {
+        return org.apache.accumulo.wikisearch.protobuf.Uid.List.getDefaultInstance();
+      }
+      
+      public boolean isInitialized() {
+        return result.isInitialized();
+      }
+      
+      public org.apache.accumulo.wikisearch.protobuf.Uid.List build() { // like buildPartial(), but first rejects a message with unset required fields
+        if (result != null && !isInitialized()) { // null result is left for buildPartial() to report as IllegalStateException
+          throw newUninitializedMessageException(result);
+        }
+        return buildPartial();
+      }
+      
+      private org.apache.accumulo.wikisearch.protobuf.Uid.List buildParsed() throws com.google.protobuf.InvalidProtocolBufferException { // used by the parse methods: missing required fields surface as InvalidProtocolBufferException
+        if (!isInitialized()) {
+          throw newUninitializedMessageException(result).asInvalidProtocolBufferException();
+        }
+        return buildPartial();
+      }
+      
+      public org.apache.accumulo.wikisearch.protobuf.Uid.List buildPartial() { // hands over the message without checking required fields; the builder cannot be reused afterwards
+        if (result == null) {
+          throw new IllegalStateException("build() has already been called on this Builder.");
+        }
+        if (result.uID_ != java.util.Collections.EMPTY_LIST) { // identity check: only a list that was replaced by the builder needs wrapping
+          result.uID_ = java.util.Collections.unmodifiableList(result.uID_);
+        }
+        org.apache.accumulo.wikisearch.protobuf.Uid.List returnMe = result;
+        result = null; // marks this builder as consumed
+        return returnMe;
+      }
+      
+      public Builder mergeFrom(com.google.protobuf.Message other) { // dispatches to the typed overload when other is a Uid.List
+        if (other instanceof org.apache.accumulo.wikisearch.protobuf.Uid.List) {
+          return mergeFrom((org.apache.accumulo.wikisearch.protobuf.Uid.List) other);
+        } else {
+          super.mergeFrom(other);
+          return this;
+        }
+      }
+      
+      public Builder mergeFrom(org.apache.accumulo.wikisearch.protobuf.Uid.List other) { // set scalar fields overwrite, UID elements are appended, unknown fields are merged
+        if (other == org.apache.accumulo.wikisearch.protobuf.Uid.List.getDefaultInstance())
+          return this;
+        if (other.hasIGNORE()) {
+          setIGNORE(other.getIGNORE());
+        }
+        if (other.hasCOUNT()) {
+          setCOUNT(other.getCOUNT());
+        }
+        if (!other.uID_.isEmpty()) {
+          if (result.uID_.isEmpty()) { // an empty list is swapped for a mutable ArrayList before appending
+            result.uID_ = new java.util.ArrayList<java.lang.String>();
+          }
+          result.uID_.addAll(other.uID_);
+        }
+        this.mergeUnknownFields(other.getUnknownFields());
+        return this;
+      }
+      
+      public Builder mergeFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+          throws java.io.IOException { // reads tagged fields from input into this builder until a terminating tag is seen
+        com.google.protobuf.UnknownFieldSet.Builder unknownFields = com.google.protobuf.UnknownFieldSet.newBuilder(this.getUnknownFields());
+        while (true) {
+          int tag = input.readTag();
+          switch (tag) {
+            case 0: // per the CodedInputStream docs, readTag() returns 0 at end of input
+              this.setUnknownFields(unknownFields.build());
+              return this;
+            default: { // tag not handled below: hand it to parseUnknownField; a false result ends parsing
+              if (!parseUnknownField(input, unknownFields, extensionRegistry, tag)) {
+                this.setUnknownFields(unknownFields.build());
+                return this;
+              }
+              break;
+            }
+            case 8: { // 8 == (1 << 3) | 0: field 1 as a varint -> IGNORE
+              setIGNORE(input.readBool());
+              break;
+            }
+            case 16: { // 16 == (2 << 3) | 0: field 2 as a varint -> COUNT
+              setCOUNT(input.readUInt64());
+              break;
+            }
+            case 26: { // 26 == (3 << 3) | 2: field 3 as length-delimited bytes -> one UID element
+              addUID(input.readString());
+              break;
+            }
+          }
+        }
+      }
+      
+      // required bool IGNORE = 1;
+      public boolean hasIGNORE() {
+        return result.hasIGNORE();
+      }
+      
+      public boolean getIGNORE() {
+        return result.getIGNORE();
+      }
+      
+      public Builder setIGNORE(boolean value) {
+        result.hasIGNORE = true;
+        result.iGNORE_ = value;
+        return this;
+      }
+      
+      public Builder clearIGNORE() {
+        result.hasIGNORE = false;
+        result.iGNORE_ = false;
+        return this;
+      }
+      
+      // required uint64 COUNT = 2;
+      public boolean hasCOUNT() {
+        return result.hasCOUNT();
+      }
+      
+      public long getCOUNT() {
+        return result.getCOUNT();
+      }
+      
+      public Builder setCOUNT(long value) {
+        result.hasCOUNT = true;
+        result.cOUNT_ = value;
+        return this;
+      }
+      
+      public Builder clearCOUNT() {
+        result.hasCOUNT = false;
+        result.cOUNT_ = 0L;
+        return this;
+      }
+      
+      // repeated string UID = 3;
+      public java.util.List<java.lang.String> getUIDList() {
+        return java.util.Collections.unmodifiableList(result.uID_); // read-only wrapper; callers cannot modify the builder's list through it
+      }
+      
+      public int getUIDCount() {
+        return result.getUIDCount();
+      }
+      
+      public java.lang.String getUID(int index) {
+        return result.getUID(index);
+      }
+      
+      public Builder setUID(int index, java.lang.String value) { // replaces an existing element; null values are rejected
+        if (value == null) {
+          throw new NullPointerException();
+        }
+        result.uID_.set(index, value);
+        return this;
+      }
+      
+      public Builder addUID(java.lang.String value) { // appends one element; null values are rejected
+        if (value == null) {
+          throw new NullPointerException();
+        }
+        if (result.uID_.isEmpty()) { // an empty list is swapped for a mutable ArrayList before the first add
+          result.uID_ = new java.util.ArrayList<java.lang.String>();
+        }
+        result.uID_.add(value);
+        return this;
+      }
+      
+      public Builder addAllUID(java.lang.Iterable<? extends java.lang.String> values) {
+        if (result.uID_.isEmpty()) {
+          result.uID_ = new java.util.ArrayList<java.lang.String>();
+        }
+        super.addAll(values, result.uID_);
+        return this;
+      }
+      
+      public Builder clearUID() {
+        result.uID_ = java.util.Collections.emptyList(); // back to the shared empty list
+        return this;
+      }
+      
+      // @@protoc_insertion_point(builder_scope:protobuf.List)
+    }
+    
+    static { // creates the shared default instance; statement order matters
+      defaultInstance = new List(true);
+      org.apache.accumulo.wikisearch.protobuf.Uid.internalForceInit(); // static call into the outer class, so Uid's static initializer has run before continuing
+      defaultInstance.initFields();
+    }
+    
+    // @@protoc_insertion_point(class_scope:protobuf.List)
+  }
+  
+  private static com.google.protobuf.Descriptors.Descriptor internal_static_protobuf_List_descriptor;
+  private static com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_protobuf_List_fieldAccessorTable;
+  
+  public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
+    return descriptor;
+  }
+  
+  private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
+  static { // builds the file descriptor and the reflection table for List when Uid is initialized
+    java.lang.String[] descriptorData = {"\n\tUid.proto\022\010protobuf\"2\n\004List\022\016\n\006IGNORE\030" // escaped descriptor bytes for Uid.proto; must not be edited by hand
+        + "\001 \002(\010\022\r\n\005COUNT\030\002 \002(\004\022\013\n\003UID\030\003 \003(\tB\014\n\010pro" + "tobufH\001"};
+    com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() {
+      public com.google.protobuf.ExtensionRegistry assignDescriptors(com.google.protobuf.Descriptors.FileDescriptor root) { // callback that receives the built descriptor
+        descriptor = root; // must be stored first: getDescriptor() on the next line reads this field
+        internal_static_protobuf_List_descriptor = getDescriptor().getMessageTypes().get(0); // List is message type 0 of this file
+        internal_static_protobuf_List_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable(
+            internal_static_protobuf_List_descriptor, new java.lang.String[] {"IGNORE", "COUNT", "UID",}, org.apache.accumulo.wikisearch.protobuf.Uid.List.class,
+            org.apache.accumulo.wikisearch.protobuf.Uid.List.Builder.class);
+        return null; // no extension registry is supplied
+      }
+    };
+    com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[] {}, // empty array: no dependency descriptors are passed
+        assigner);
+  }
+  
+  public static void internalForceInit() {} // intentionally empty: List's static initializer calls it so that this outer class is initialized first
+  
+  // @@protoc_insertion_point(outer_class_scope)
+}

Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/Uid.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/AggregatingRecordReader.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/AggregatingRecordReader.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/AggregatingRecordReader.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/AggregatingRecordReader.java Fri Jan  6 22:02:09 2012
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.reader;
+
+
+import java.io.IOException;
+
+import org.apache.accumulo.wikisearch.ingest.WikipediaConfiguration;
+import org.apache.accumulo.wikisearch.util.TextUtil;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+
+/**
+ * This class aggregates Text values based on a start and end filter. An example use case for this would be XML data. This will not work with data that has
+ * nested start and stop tokens.
+ * 
+ */
+public class AggregatingRecordReader extends LongLineRecordReader {
+  
+  public static final String START_TOKEN = "aggregating.token.start";
+  public static final String END_TOKEN = "aggregating.token.end";
+  public static final String RETURN_PARTIAL_MATCHES = "aggregating.allow.partial";
+  
+  private LongWritable key = new LongWritable();
+  private String startToken = null;
+  private String endToken = null;
+  private long counter = 0;
+  private Text aggValue = new Text();
+  private boolean startFound = false;
+  private StringBuilder remainder = new StringBuilder(0);
+  private boolean returnPartialMatches = false;
+  
+  @Override
+  public LongWritable getCurrentKey() {
+    key.set(counter);
+    return key;
+  }
+  
+  @Override
+  public Text getCurrentValue() {
+    return aggValue;
+  }
+  
+  @Override
+  public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
+    super.initialize(genericSplit, context);
+    this.startToken = WikipediaConfiguration.isNull(context.getConfiguration(), START_TOKEN, String.class);
+    this.endToken = WikipediaConfiguration.isNull(context.getConfiguration(), END_TOKEN, String.class);
+    this.returnPartialMatches = context.getConfiguration().getBoolean(RETURN_PARTIAL_MATCHES, false);
+    
+    /*
+     * Text-appending works almost exactly like the + operator on Strings- it creates a byte array exactly the size of [prefix + suffix] and dumps the bytes
+     * into the new array. This module works by doing lots of little additions, one line at a time. With most XML, the documents are partitioned on line
+     * boundaries, so we will generally have lots of additions. Setting a large default byte array for a text object can avoid this and give us
+     * StringBuilder-like functionality for Text objects.
+     */
+    byte[] txtBuffer = new byte[2048];
+    aggValue.set(txtBuffer);
+  }
+  
+  @Override
+  public boolean nextKeyValue() throws IOException {
+    aggValue.clear();
+    boolean hasNext = false;
+    boolean finished = false;
+    // Find the start token
+    while (!finished && (((hasNext = super.nextKeyValue()) == true) || remainder.length() > 0)) {
+      if (hasNext)
+        finished = process(super.getCurrentValue());
+      else
+        finished = process(null);
+      if (finished) {
+        startFound = false;
+        counter++;
+        return true;
+      }
+    }
+    // If we have anything loaded in the agg value (and we found a start)
+    // then we ran out of data before finding the end. Just return the
+    // data we have and if it's not valid, downstream parsing of the data
+    // will fail.
+    if (returnPartialMatches && startFound && aggValue.getLength() > 0) {
+      startFound = false;
+      counter++;
+      return true;
+    }
+    return false;
+  }
+  
+  /**
+   * Populates aggValue with the contents of the Text object.
+   * 
+   * @param t
+   * @return true if aggValue is complete, else false and needs more data.
+   */
+  private boolean process(Text t) {
+    
+    if (null != t)
+      remainder.append(t.toString());
+    while (remainder.length() > 0) {
+      if (!startFound) {
+        // If found, then begin aggregating at the start offset
+        int start = remainder.indexOf(startToken);
+        if (-1 != start) {
+          // Append the start token to the aggregate value
+          TextUtil.textAppendNoNull(aggValue, remainder.substring(start, start + startToken.length()), false);
+          // Remove to the end of the start token from the remainder
+          remainder.delete(0, start + startToken.length());
+          startFound = true;
+        } else {
+          // If we are looking for the start and have not found it, then remove
+          // the bytes
+          remainder.delete(0, remainder.length());
+        }
+      } else {
+        // Try to find the end
+        int end = remainder.indexOf(endToken);
+        // Also try to find the start
+        int start = remainder.indexOf(startToken);
+        if (-1 == end) {
+          if (returnPartialMatches && start >= 0) {
+            // End token not found, but another start token was found...
+            // The amount to copy is up to the beginning of the next start token
+            TextUtil.textAppendNoNull(aggValue, remainder.substring(0, start), false);
+            remainder.delete(0, start);
+            return true;
+          } else {
+            // Not found, aggregate the entire remainder
+            TextUtil.textAppendNoNull(aggValue, remainder.toString(), false);
+            // Delete all chars from remainder
+            remainder.delete(0, remainder.length());
+          }
+        } else {
+          if (returnPartialMatches && start >= 0 && start < end) {
+            // We found the end token, but found another start token first, so
+            // deal with that.
+            TextUtil.textAppendNoNull(aggValue, remainder.substring(0, start), false);
+            remainder.delete(0, start);
+            return true;
+          } else {
+            // END_TOKEN was found. Extract to the end of END_TOKEN
+            TextUtil.textAppendNoNull(aggValue, remainder.substring(0, end + endToken.length()), false);
+            // Remove from remainder up to the end of END_TOKEN
+            remainder.delete(0, end + endToken.length());
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+  
+}

Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/AggregatingRecordReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LfLineReader.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LfLineReader.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LfLineReader.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LfLineReader.java Fri Jan  6 22:02:09 2012
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.reader;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
+/**
+ * A class that provides a line reader from an input stream.
+ */
+public class LfLineReader {
+  private static final int DEFAULT_BUFFER_SIZE = 64 * 1024;
+  private int bufferSize = DEFAULT_BUFFER_SIZE;
+  private InputStream in;
+  private byte[] buffer;
+  // the number of bytes of real data in the buffer
+  private int bufferLength = 0;
+  // the current position in the buffer
+  private int bufferPosn = 0;
+  
+  private static final byte LF = '\n';
+  
+  /**
+   * Create a line reader that reads from the given stream using the default buffer-size (64k).
+   * 
+   * @param in
+   *          The input stream
+   * @throws IOException
+   */
+  public LfLineReader(InputStream in) {
+    this(in, DEFAULT_BUFFER_SIZE);
+  }
+  
+  /**
+   * Create a line reader that reads from the given stream using the given buffer-size.
+   * 
+   * @param in
+   *          The input stream
+   * @param bufferSize
+   *          Size of the read buffer
+   * @throws IOException
+   */
+  public LfLineReader(InputStream in, int bufferSize) {
+    this.in = in;
+    this.bufferSize = bufferSize;
+    this.buffer = new byte[this.bufferSize];
+  }
+  
+  /**
+   * Create a line reader that reads from the given stream using the <code>io.file.buffer.size</code> specified in the given <code>Configuration</code>.
+   * 
+   * @param in
+   *          input stream
+   * @param conf
+   *          configuration
+   * @throws IOException
+   */
+  public LfLineReader(InputStream in, Configuration conf) throws IOException {
+    this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE));
+  }
+  
+  /**
+   * Close the underlying stream.
+   * 
+   * @throws IOException
+   */
+  public void close() throws IOException {
+    in.close();
+  }
+  
+  /**
+   * Read one line from the InputStream into the given Text. A line can be terminated by '\n' (LF). EOF also terminates an otherwise unterminated line.
+   * 
+   * @param str
+   *          the object to store the given line (without newline)
+   * @param maxLineLength
+   *          the maximum number of bytes to store into str; the rest of the line is silently discarded.
+   * @param maxBytesToConsume
+   *          the maximum number of bytes to consume in this call. This is only a hint, because if the line cross this threshold, we allow it to happen. It can
+   *          overshoot potentially by as much as one buffer length.
+   * 
+   * @return the number of bytes read including the (longest) newline found.
+   * 
+   * @throws IOException
+   *           if the underlying stream throws
+   */
+  public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
+    /*
+     * We're reading data from in, but the head of the stream may be already buffered in buffer, so we have several cases: 1. No newline characters are in the
+     * buffer, so we need to copy everything and read another buffer from the stream. 2. An unambiguously terminated line is in buffer, so we just copy to str.
+     */
+    str.clear();
+    int txtLength = 0; // tracks str.getLength(), as an optimization
+    int newlineLength = 0; // length of terminating newline
+    long bytesConsumed = 0;
+    do {
+      int startPosn = bufferPosn; // starting from where we left off the last time
+      if (bufferPosn >= bufferLength) {
+        startPosn = bufferPosn = 0;
+        bufferLength = in.read(buffer);
+        if (bufferLength <= 0)
+          break; // EOF
+      }
+      for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline
+        if (buffer[bufferPosn] == LF) {
+          newlineLength = 1;
+          ++bufferPosn; // at next invocation proceed from following byte
+          break;
+        }
+      }
+      int readLength = bufferPosn - startPosn;
+      bytesConsumed += readLength;
+      int appendLength = readLength - newlineLength;
+      if (appendLength > maxLineLength - txtLength) {
+        appendLength = maxLineLength - txtLength;
+      }
+      if (appendLength > 0) {
+        str.append(buffer, startPosn, appendLength);
+        txtLength += appendLength;
+      }
+    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);
+    
+    if (bytesConsumed > Integer.MAX_VALUE)
+      throw new IOException("Too many bytes before newline: " + bytesConsumed);
+    return (int) bytesConsumed;
+  }
+  
+  /**
+   * Read from the InputStream into the given Text.
+   * 
+   * @param str
+   *          the object to store the given line
+   * @param maxLineLength
+   *          the maximum number of bytes to store into str.
+   * @return the number of bytes read including the newline
+   * @throws IOException
+   *           if the underlying stream throws
+   */
+  public int readLine(Text str, int maxLineLength) throws IOException {
+    return readLine(str, maxLineLength, Integer.MAX_VALUE);
+  }
+  
+  /**
+   * Read from the InputStream into the given Text.
+   * 
+   * @param str
+   *          the object to store the given line
+   * @return the number of bytes read including the newline
+   * @throws IOException
+   *           if the underlying stream throws
+   */
+  public int readLine(Text str) throws IOException {
+    return readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE);
+  }
+  
+}

Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LfLineReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LongLineRecordReader.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LongLineRecordReader.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LongLineRecordReader.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LongLineRecordReader.java Fri Jan  6 22:02:09 2012
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.reader;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
+import org.apache.hadoop.util.LineReader;
+
+/**
+ * A copy of {@link LineRecordReader} which does not discard lines longer than "mapred.linerecordreader.maxlength". Instead, it returns them, leaving it to the
+ * mapper to decide what to do with it. It also does not treat '\r' (CR) characters as new lines -- it uses {@link LfLineReader} instead of {@link LineReader}
+ * to read lines.
+ * 
+ * Keys are the position at which each line starts (tracked from the split start plus the bytes consumed so far); values are the line contents without the
+ * trailing newline.
+ */
+public class LongLineRecordReader extends RecordReader<LongWritable,Text> {
+  private CompressionCodecFactory compressionCodecs = null;
+  private long start;
+  private long pos;
+  private long end;
+  private LfLineReader in;
+  private int maxLineLength;
+  private LongWritable key = null;
+  private Text value = null;
+  
+  /**
+   * Opens the file behind the given split and positions the reader on the first line that belongs to it.
+   * 
+   * For uncompressed input that does not start at offset 0, reading begins one byte before the split start and the first (possibly partial) line is skipped.
+   * For input with a recognized compression codec the end of the split is set to Long.MAX_VALUE.
+   * 
+   * @param genericSplit
+   *          the split to read; must be a {@link FileSplit}
+   * @param context
+   *          supplies the job configuration
+   * @throws IOException
+   *           if the file cannot be opened, positioned or read; the opened stream is closed before the exception propagates
+   */
+  @Override
+  public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
+    FileSplit split = (FileSplit) genericSplit;
+    Configuration job = context.getConfiguration();
+    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
+    start = split.getStart();
+    end = start + split.getLength();
+    final Path file = split.getPath();
+    compressionCodecs = new CompressionCodecFactory(job);
+    final CompressionCodec codec = compressionCodecs.getCodec(file);
+    
+    // open the file and seek to the start of the split
+    FileSystem fs = file.getFileSystem(job);
+    FSDataInputStream fileIn = fs.open(file);
+    boolean initialized = false;
+    try {
+      boolean skipFirstLine = false;
+      if (codec != null) {
+        in = new LfLineReader(codec.createInputStream(fileIn), job);
+        end = Long.MAX_VALUE;
+      } else {
+        if (start != 0) {
+          // back up one byte so that a line starting exactly at the split boundary is not skipped
+          skipFirstLine = true;
+          --start;
+          fileIn.seek(start);
+        }
+        in = new LfLineReader(fileIn, job);
+      }
+      if (skipFirstLine) { // skip first line and re-establish "start".
+        start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
+      }
+      this.pos = start;
+      initialized = true;
+    } finally {
+      if (!initialized) {
+        // Setup failed part-way: release the file handle here, because the reader is not usable and close() may never be called on it.
+        in = null;
+        try {
+          fileIn.close();
+        } catch (IOException ignored) {
+          // deliberately ignored so the failure that aborted initialization is the one that propagates
+        }
+      }
+    }
+  }
+  
+  /**
+   * Advances to the next line of the split.
+   * 
+   * @return true if a line was read; false if the position has reached the end of the split or the reader returned no bytes, in which case the current key
+   *         and value are reset to null
+   * @throws IOException
+   *           if the underlying reader throws
+   */
+  @Override
+  public boolean nextKeyValue() throws IOException {
+    if (key == null) {
+      key = new LongWritable();
+    }
+    key.set(pos);
+    if (value == null) {
+      value = new Text();
+    }
+    int newSize = 0;
+    if (pos < end) {
+      // allow at least maxLineLength bytes to be consumed even when fewer remain in the split
+      newSize = in.readLine(value, maxLineLength, Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
+      if (newSize != 0) {
+        pos += newSize;
+      }
+    }
+    if (newSize == 0) {
+      key = null;
+      value = null;
+      return false;
+    } else {
+      return true;
+    }
+  }
+  
+  /**
+   * @return the position at which the current line starts, or null if there is no current line
+   */
+  @Override
+  public LongWritable getCurrentKey() {
+    return key;
+  }
+  
+  /**
+   * @return the current line, or null if there is no current line
+   */
+  @Override
+  public Text getCurrentValue() {
+    return value;
+  }
+  
+  /**
+   * Get the progress within the split
+   * 
+   * @return the fraction of the range from start to end that has been consumed, capped at 1.0; 0.0 when the range is empty
+   */
+  @Override
+  public float getProgress() {
+    if (start == end) {
+      return 0.0f;
+    } else {
+      return Math.min(1.0f, (pos - start) / (float) (end - start));
+    }
+  }
+  
+  /**
+   * Closes the underlying line reader, if one was opened.
+   * 
+   * @throws IOException
+   *           if closing the reader fails
+   */
+  @Override
+  public synchronized void close() throws IOException {
+    if (in != null) {
+      in.close();
+    }
+  }
+}

Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LongLineRecordReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/util/TextUtil.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/util/TextUtil.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/util/TextUtil.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/util/TextUtil.java Fri Jan  6 22:02:09 2012
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.util;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import org.apache.hadoop.io.Text;
+import org.apache.accumulo.core.iterators.aggregation.LongSummation;
+
+public class TextUtil {
+  
+  private static final byte[] nullByte = {0};
+  
+  /**
+   * Appends a single null byte and then the UTF-8 encoding of the given string to the given {@link Text}. Characters that cannot be encoded are not replaced.
+   * 
+   * @param text
+   *          the Text to which to append
+   * @param string
+   *          the String to append
+   * @throws IllegalArgumentException
+   *           if the string cannot be encoded
+   */
+  public static void textAppend(Text text, String string) {
+    textAppend(text, string, false);
+  }
+  
+  /**
+   * Appends a single null byte and then the UTF-8 encoding of the given string to the given {@link Text}.
+   * 
+   * @param text
+   *          the Text to which to append
+   * @param string
+   *          the String to append
+   * @param replaceBadChar
+   *          passed through to {@link Text#encode(String, boolean)}
+   * @throws IllegalArgumentException
+   *           if the string cannot be encoded
+   */
+  public static void textAppend(Text text, String string, boolean replaceBadChar) {
+    appendNullByte(text);
+    textAppendNoNull(text, string, replaceBadChar);
+  }
+  
+  /**
+   * Appends a single null byte and then the first eight bytes produced by {@link LongSummation#longToBytes(long)} for the given value.
+   * 
+   * @param t
+   *          the Text to which to append
+   * @param s
+   *          the value to append
+   */
+  public static void textAppend(Text t, long s) {
+    appendNullByte(t);
+    byte[] encodedLong = LongSummation.longToBytes(s);
+    t.append(encodedLong, 0, 8);
+  }
+  
+  /**
+   * Appends one null byte to the given text.
+   * 
+   * @param text
+   *          the text to which to append the null byte
+   */
+  public static void appendNullByte(Text text) {
+    text.append(nullByte, 0, nullByte.length);
+  }
+  
+  /**
+   * Appends the UTF-8 bytes of the given string to the given {@link Text}, without a leading null byte. Characters that cannot be encoded are not replaced.
+   * 
+   * @param t
+   *          the Text to which to append
+   * @param s
+   *          the String to append
+   * @throws IllegalArgumentException
+   *           if the string cannot be encoded
+   */
+  public static void textAppendNoNull(Text t, String s) {
+    textAppendNoNull(t, s, false);
+  }
+  
+  /**
+   * Appends the UTF-8 bytes of the given string to the given {@link Text}, without a leading null byte.
+   * 
+   * @param t
+   *          the Text to which to append
+   * @param s
+   *          the String to append
+   * @param replaceBadChar
+   *          passed through to {@link Text#encode(String, boolean)}
+   * @throws IllegalArgumentException
+   *           if the string cannot be encoded
+   */
+  public static void textAppendNoNull(Text t, String s, boolean replaceBadChar) {
+    ByteBuffer encoded;
+    try {
+      encoded = Text.encode(s, replaceBadChar);
+    } catch (CharacterCodingException cce) {
+      throw new IllegalArgumentException(cce);
+    }
+    // only the bytes up to limit() are valid; the backing array may be larger
+    t.append(encoded.array(), 0, encoded.limit());
+  }
+  
+  /**
+   * Converts the given string to its UTF-8 bytes using Hadoop's {@link Text#encode(String, boolean)} rather than {@link String#getBytes(String)}.
+   * 
+   * @param string
+   *          the string to convert
+   * @return a newly allocated array holding the UTF-8 representation of the string
+   * @throws IllegalArgumentException
+   *           if the string cannot be encoded
+   */
+  public static byte[] toUtf8(String string) {
+    try {
+      ByteBuffer encoded = Text.encode(string, false);
+      int length = encoded.limit();
+      // copy out exactly the valid bytes so the result is not padded by the backing array
+      byte[] utf8 = new byte[length];
+      System.arraycopy(encoded.array(), 0, utf8, 0, length);
+      return utf8;
+    } catch (CharacterCodingException cce) {
+      throw new IllegalArgumentException(cce);
+    }
+  }
+}

Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/util/TextUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/TermWeight.proto
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/TermWeight.proto?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/TermWeight.proto (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/TermWeight.proto Fri Jan  6 22:02:09 2012
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// compile with protoc --java_out ../java
+// compile extra builder util with java accumulo.data.protobuf.builder.ProtoBufBuilder -d ../java accumulo.data.protobuf.UidList
+//      classpath for compile command should include ../../../target/classes and protobuf-java-2.2.0.jar
+
+package protobuf;
+
+option java_package = "protobuf";
+option optimize_for = SPEED;
+
+// Info carries one required float and a repeated list of unsigned 32-bit values.
+// NOTE(review): judging by the field names this is per-term weighting data
+// (a normalized term frequency plus the word positions of the term) -- confirm
+// against the code that builds and reads this message.
+message Info {
+	// required: the message cannot be serialized without this value
+	required float normalizedTermFrequency = 1;
+	// zero or more entries
+	repeated uint32 wordOffset = 2;
+}

Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/Uid.proto
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/Uid.proto?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/Uid.proto (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/Uid.proto Fri Jan  6 22:02:09 2012
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// compile with protoc --java_out ../java
+// compile extra builder util with java accumulo.data.protobuf.builder.ProtoBufBuilder -d ../java accumulo.data.protobuf.UidList
+//      classpath for compile command should include ../../../target/classes and protobuf-java-2.2.0.jar
+
+package protobuf;
+
+option java_package = "protobuf";
+option optimize_for = SPEED;
+
+// List carries a required flag, a required 64-bit unsigned count and a
+// repeated list of strings.
+// NOTE(review): presumably a list of document UIDs together with their total
+// count, where IGNORE tells readers to rely on COUNT instead of the UID
+// entries -- confirm against the code that builds and reads this message.
+message List {
+  // required flag; its exact meaning is not visible in this file
+  required bool IGNORE = 1;
+  // required 64-bit unsigned value
+  required uint64 COUNT = 2;
+  // zero or more entries
+  repeated string UID = 3;
+}

Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/compile_protos.sh
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/compile_protos.sh?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/compile_protos.sh (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/compile_protos.sh Fri Jan  6 22:02:09 2012
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Generate Java sources into ../java for every .proto file in the current directory.
+# The shell glob is used directly instead of parsing `ls` output, and the file name is
+# quoted, so names containing whitespace are passed to protoc intact.
+for PROTO in *.proto; do
+  # When nothing matches, the pattern is left as the literal string "*.proto"; skip it.
+  [ -e "$PROTO" ] || continue
+  protoc --java_out ../java "$PROTO"
+done

Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/compile_protos.sh
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/compile_protos.sh
------------------------------------------------------------------------------
    svn:executable = *