Posted to commits@accumulo.apache.org by ec...@apache.org on 2012/01/12 17:06:20 UTC

svn commit: r1230608 [5/16] - in /incubator/accumulo/trunk: ./ contrib/accumulo_sample/ src/assemble/ src/core/ src/core/src/main/java/org/apache/accumulo/core/client/impl/thrift/ src/core/src/main/java/org/apache/accumulo/core/master/thrift/ src/core/...

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.ingest;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.accumulo.core.client.AccumuloException;
+import org.apache.accumulo.core.client.AccumuloSecurityException;
+import org.apache.accumulo.core.client.Connector;
+import org.apache.accumulo.core.client.TableExistsException;
+import org.apache.accumulo.core.client.TableNotFoundException;
+import org.apache.accumulo.core.client.admin.TableOperations;
+import org.apache.accumulo.core.client.mapreduce.AccumuloOutputFormat;
+import org.apache.accumulo.core.conf.Property;
+import org.apache.accumulo.core.data.Mutation;
+import org.apache.accumulo.core.iterators.IteratorUtil.IteratorScope;
+import org.apache.accumulo.core.iterators.aggregation.NumSummation;
+import org.apache.accumulo.core.iterators.aggregation.conf.AggregatorConfiguration;
+import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+
+@SuppressWarnings("deprecation")
+public class WikipediaIngester extends Configured implements Tool {
+  
+  public final static String INGEST_LANGUAGE = "wikipedia.ingest_language";
+  public final static String SPLIT_FILE = "wikipedia.split_file";
+  public final static String TABLE_NAME = "wikipedia.table";
+  
+  public static void main(String[] args) throws Exception {
+    int res = ToolRunner.run(new Configuration(), new WikipediaIngester(), args);
+    System.exit(res);
+  }
+  
+  private void createTables(TableOperations tops, String tableName) throws AccumuloException, AccumuloSecurityException, TableNotFoundException,
+      TableExistsException {
+    // Derive the names of the index, reverse index, and metadata tables from the shard table name
+    String indexTableName = tableName + "Index";
+    String reverseIndexTableName = tableName + "ReverseIndex";
+    String metadataTableName = tableName + "Metadata";
+    
+    // create the shard table
+    if (!tops.exists(tableName)) {
+      // Set a text index aggregator on the given field names. No aggregator is set if the option is not supplied
+      String textIndexFamilies = WikipediaMapper.TOKENS_FIELD_NAME;
+      
+      if (textIndexFamilies.length() > 0) {
+        System.out.println("Adding content aggregator on the fields: " + textIndexFamilies);
+        
+        // Create and set the aggregators in one shot
+        List<AggregatorConfiguration> aggregators = new ArrayList<AggregatorConfiguration>();
+        
+        for (String family : StringUtils.split(textIndexFamilies, ',')) {
+          aggregators.add(new AggregatorConfiguration(new Text("fi\0" + family), org.apache.accumulo.examples.wikisearch.aggregator.TextIndexAggregator.class.getName()));
+        }
+        
+        tops.create(tableName);
+        tops.addAggregators(tableName, aggregators);
+      } else {
+        tops.create(tableName);
+      }
+      
+      // Set the locality group for the full content column family
+      tops.setLocalityGroups(tableName, Collections.singletonMap("WikipediaDocuments", Collections.singleton(new Text(WikipediaMapper.DOCUMENT_COLUMN_FAMILY))));
+      
+    }
+    
+    if (!tops.exists(indexTableName)) {
+      tops.create(indexTableName);
+      // Add the UID aggregator
+      for (IteratorScope scope : IteratorScope.values()) {
+        String stem = String.format("%s%s.%s", Property.TABLE_ITERATOR_PREFIX, scope.name(), "UIDAggregator");
+        tops.setProperty(indexTableName, stem, "19,org.apache.accumulo.examples.wikisearch.iterator.TotalAggregatingIterator");
+        stem += ".opt.";
+        tops.setProperty(indexTableName, stem + "*", "org.apache.accumulo.examples.wikisearch.aggregator.GlobalIndexUidAggregator");
+        
+      }
+    }
+    
+    if (!tops.exists(reverseIndexTableName)) {
+      tops.create(reverseIndexTableName);
+      // Add the UID aggregator
+      for (IteratorScope scope : IteratorScope.values()) {
+        String stem = String.format("%s%s.%s", Property.TABLE_ITERATOR_PREFIX, scope.name(), "UIDAggregator");
+        tops.setProperty(reverseIndexTableName, stem, "19,org.apache.accumulo.examples.wikisearch.iterator.TotalAggregatingIterator");
+        stem += ".opt.";
+        tops.setProperty(reverseIndexTableName, stem + "*", "org.apache.accumulo.examples.wikisearch.aggregator.GlobalIndexUidAggregator");
+        
+      }
+    }
+    
+    if (!tops.exists(metadataTableName)) {
+      // Add the NumSummation aggregator for the frequency column
+      List<AggregatorConfiguration> aggregators = new ArrayList<AggregatorConfiguration>();
+      aggregators.add(new AggregatorConfiguration(new Text("f"), NumSummation.class.getName()));
+      tops.create(metadataTableName);
+      tops.addAggregators(metadataTableName, aggregators);
+    }
+  }
+  
+  @Override
+  public int run(String[] args) throws Exception {
+    Job job = new Job(getConf(), "Ingest Wikipedia");
+    Configuration conf = job.getConfiguration();
+    conf.set("mapred.map.tasks.speculative.execution", "false");
+
+    String tablename = WikipediaConfiguration.getTableName(conf);
+    
+    String zookeepers = WikipediaConfiguration.getZookeepers(conf);
+    String instanceName = WikipediaConfiguration.getInstanceName(conf);
+    
+    String user = WikipediaConfiguration.getUser(conf);
+    byte[] password = WikipediaConfiguration.getPassword(conf);
+    Connector connector = WikipediaConfiguration.getConnector(conf);
+    
+    TableOperations tops = connector.tableOperations();
+    
+    createTables(tops, tablename);
+    
+    configureJob(job);
+    
+    List<Path> inputPaths = new ArrayList<Path>();
+    SortedSet<String> languages = new TreeSet<String>();
+    FileSystem fs = FileSystem.get(conf);
+    Path parent = new Path(conf.get("wikipedia.input"));
+    listFiles(parent, fs, inputPaths, languages);
+    
+    System.out.println("Input files in " + parent + ":" + inputPaths.size());
+    Path[] inputPathsArray = new Path[inputPaths.size()];
+    inputPaths.toArray(inputPathsArray);
+    
+    System.out.println("Languages:" + languages.size());
+    
+    FileInputFormat.setInputPaths(job, inputPathsArray);
+    
+    job.setMapperClass(WikipediaMapper.class);
+    job.setNumReduceTasks(0);
+    job.setMapOutputKeyClass(Text.class);
+    job.setMapOutputValueClass(Mutation.class);
+    job.setOutputFormatClass(AccumuloOutputFormat.class);
+    AccumuloOutputFormat.setOutputInfo(job, user, password, true, tablename);
+    AccumuloOutputFormat.setZooKeeperInstance(job, instanceName, zookeepers);
+    
+    return job.waitForCompletion(true) ? 0 : 1;
+  }
+  
+  public final static PathFilter partFilter = new PathFilter() {
+    @Override
+    public boolean accept(Path path) {
+      return path.getName().startsWith("part");
+    };
+  };
+  
+  protected void configureJob(Job job) {
+    Configuration conf = job.getConfiguration();
+    job.setJarByClass(WikipediaIngester.class);
+    job.setInputFormatClass(WikipediaInputFormat.class);
+    conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
+    conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
+  }
+  
+  protected static final Pattern filePattern = Pattern.compile("([a-z_]+).*.xml(.bz2)?");
+  
+  protected void listFiles(Path path, FileSystem fs, List<Path> files, Set<String> languages) throws IOException {
+    for (FileStatus status : fs.listStatus(path)) {
+      if (status.isDir()) {
+        listFiles(status.getPath(), fs, files, languages);
+      } else {
+        Path p = status.getPath();
+        Matcher matcher = filePattern.matcher(p.getName());
+        if (matcher.matches()) {
+          languages.add(matcher.group(1));
+          files.add(p);
+        }
+      }
+    }
+  }
+}
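
For reference, the per-scope iterator loop above ends up setting table properties along these lines on the index and reverse index tables (a sketch; it assumes Property.TABLE_ITERATOR_PREFIX renders as the "table.iterator." key prefix and that IteratorScope covers the scan, minc, and majc scopes):

    table.iterator.scan.UIDAggregator       = 19,org.apache.accumulo.examples.wikisearch.iterator.TotalAggregatingIterator
    table.iterator.scan.UIDAggregator.opt.* = org.apache.accumulo.examples.wikisearch.aggregator.GlobalIndexUidAggregator

with the same pair repeated for the minc and majc scopes, so the UID aggregator runs at scan, minor compaction, and major compaction time with priority 19.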

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.ingest;
+
+import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+
+
+public class WikipediaInputFormat extends TextInputFormat {
+  
+  @Override
+  public RecordReader<LongWritable,Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
+    return new AggregatingRecordReader();
+  }
+  
+  @Override
+  protected boolean isSplitable(JobContext context, Path file) {
+    return false;
+  }
+  
+}
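
A minimal sketch of wiring this input format into a job, mirroring what WikipediaIngester.configureJob() does above; the input path is a hypothetical placeholder:

    import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

    Job job = new Job(new Configuration(), "wikipedia ingest (sketch)");
    job.setJarByClass(WikipediaInputFormat.class);
    // Hand each <page>...</page> element to the mapper as a single record;
    // isSplitable() returns false, so every dump file is read by exactly one mapper.
    job.setInputFormatClass(WikipediaInputFormat.class);
    job.getConfiguration().set(AggregatingRecordReader.START_TOKEN, "<page>");
    job.getConfiguration().set(AggregatingRecordReader.END_TOKEN, "</page>");
    FileInputFormat.setInputPaths(job, new Path("/wikipedia/enwiki-pages-articles.xml.bz2")); // hypothetical path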

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,258 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * 
+ */
+package org.apache.accumulo.examples.wikisearch.ingest;
+
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+import java.nio.charset.Charset;
+import java.util.HashSet;
+import java.util.IllegalFormatException;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+import org.apache.accumulo.core.data.Mutation;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.security.ColumnVisibility;
+import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor.Article;
+import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;
+import org.apache.accumulo.examples.wikisearch.protobuf.Uid;
+import org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.Builder;
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.log4j.Logger;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.ar.ArabicAnalyzer;
+import org.apache.lucene.analysis.br.BrazilianAnalyzer;
+import org.apache.lucene.analysis.cjk.CJKAnalyzer;
+import org.apache.lucene.analysis.de.GermanAnalyzer;
+import org.apache.lucene.analysis.el.GreekAnalyzer;
+import org.apache.lucene.analysis.fa.PersianAnalyzer;
+import org.apache.lucene.analysis.fr.FrenchAnalyzer;
+import org.apache.lucene.analysis.nl.DutchAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;
+
+
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Multimap;
+
+public class WikipediaMapper extends Mapper<LongWritable,Text,Text,Mutation> {
+  
+  private static final Logger log = Logger.getLogger(WikipediaMapper.class);
+  
+  public final static Charset UTF8 = Charset.forName("UTF-8");
+  public static final String DOCUMENT_COLUMN_FAMILY = "d";
+  public static final String METADATA_EVENT_COLUMN_FAMILY = "e";
+  public static final String METADATA_INDEX_COLUMN_FAMILY = "i";
+  public static final String TOKENS_FIELD_NAME = "TEXT";
+  
+  private final static Pattern languagePattern = Pattern.compile("([a-z_]+).*.xml(.bz2)?");
+  private static final Value NULL_VALUE = new Value(new byte[0]);
+  private static final String cvPrefix = "all|";
+  
+  private ArticleExtractor extractor;
+  private String language;
+  private int numPartitions = 0;
+  private Set<?> stopwords = null;
+  private ColumnVisibility cv = null;
+  
+  private Text tablename = null;
+  private Text indexTableName = null;
+  private Text reverseIndexTableName = null;
+  private Text metadataTableName = null;
+  
+  @Override
+  public void setup(Context context) {
+    Configuration conf = context.getConfiguration();
+    tablename = new Text(WikipediaConfiguration.getTableName(conf));
+    indexTableName = new Text(tablename + "Index");
+    reverseIndexTableName = new Text(tablename + "ReverseIndex");
+    metadataTableName = new Text(tablename + "Metadata");
+    
+    FileSplit split = (FileSplit) context.getInputSplit();
+    String fileName = split.getPath().getName();
+    Matcher matcher = languagePattern.matcher(fileName);
+    if (matcher.matches()) {
+      language = matcher.group(1).replace('_', '-').toLowerCase();
+      if (language.equals("arwiki"))
+        stopwords = ArabicAnalyzer.getDefaultStopSet();
+      else if (language.equals("brwiki"))
+        stopwords = BrazilianAnalyzer.getDefaultStopSet();
+      else if (language.startsWith("zh"))
+        stopwords = CJKAnalyzer.getDefaultStopSet();
+      else if (language.equals("dewiki"))
+        stopwords = GermanAnalyzer.getDefaultStopSet();
+      else if (language.equals("elwiki"))
+        stopwords = GreekAnalyzer.getDefaultStopSet();
+      else if (language.equals("fawiki"))
+        stopwords = PersianAnalyzer.getDefaultStopSet();
+      else if (language.equals("frwiki"))
+        stopwords = FrenchAnalyzer.getDefaultStopSet();
+      else if (language.equals("nlwiki"))
+        stopwords = DutchAnalyzer.getDefaultStopSet();
+      else
+        stopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+      
+    } else {
+      throw new RuntimeException("Unknown ingest language! " + fileName);
+    }
+    extractor = new ArticleExtractor();
+    numPartitions = WikipediaConfiguration.getNumPartitions(conf);
+    cv = new ColumnVisibility(cvPrefix + language);
+    
+  }
+  
+  /**
+   * We will partition the documents based on the document id
+   * 
+   * @param article the article whose id determines the partition
+   * @param numPartitions the total number of partitions (shards)
+   * @return the partition id for the article
+   * @throws IllegalFormatException
+   */
+  public static int getPartitionId(Article article, int numPartitions) throws IllegalFormatException {
+    return article.getId() % numPartitions;
+  }
+  
+  @Override
+  protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+    Article article = extractor.extract(new InputStreamReader(new ByteArrayInputStream(value.getBytes()), UTF8));
+    String NULL_BYTE = "\u0000";
+    String colfPrefix = language + NULL_BYTE;
+    String indexPrefix = "fi" + NULL_BYTE;
+    if (article != null) {
+      Text partitionId = new Text(Integer.toString(WikipediaMapper.getPartitionId(article, numPartitions)));
+      
+      // Create the mutations for the document.
+      // Row is partition id, colf is language0articleid, colq is fieldName\0fieldValue
+      Mutation m = new Mutation(partitionId);
+      for (Entry<String,Object> entry : article.getFieldValues().entrySet()) {
+        m.put(colfPrefix + article.getId(), entry.getKey() + NULL_BYTE + entry.getValue().toString(), cv, article.getTimestamp(), NULL_VALUE);
+        // Create mutations for the metadata table.
+        Mutation mm = new Mutation(entry.getKey());
+        mm.put(METADATA_EVENT_COLUMN_FAMILY, language, cv, article.getTimestamp(), NULL_VALUE);
+        context.write(metadataTableName, mm);
+      }
+      
+      // Tokenize the content
+      Set<String> tokens = getTokens(article);
+      
+      // We are going to put the fields to be indexed into a multimap. This allows us to iterate
+      // over the entire set once.
+      Multimap<String,String> indexFields = HashMultimap.create();
+      // Add the normalized field values
+      LcNoDiacriticsNormalizer normalizer = new LcNoDiacriticsNormalizer();
+      for (Entry<String,String> index : article.getNormalizedFieldValues().entrySet())
+        indexFields.put(index.getKey(), index.getValue());
+      // Add the tokens
+      for (String token : tokens)
+        indexFields.put(TOKENS_FIELD_NAME, normalizer.normalizeFieldValue("", token));
+      
+      for (Entry<String,String> index : indexFields.entries()) {
+        // Create mutations for the in partition index
+        // Row is partition id, colf is 'fi'\0fieldName, colq is fieldValue\0language\0article id
+        m.put(indexPrefix + index.getKey(), index.getValue() + NULL_BYTE + colfPrefix + article.getId(), cv, article.getTimestamp(), NULL_VALUE);
+        
+        // Create mutations for the global index
+        // Create a UID object for the Value
+        Builder uidBuilder = Uid.List.newBuilder();
+        uidBuilder.setIGNORE(false);
+        uidBuilder.setCOUNT(1);
+        uidBuilder.addUID(Integer.toString(article.getId()));
+        Uid.List uidList = uidBuilder.build();
+        Value val = new Value(uidList.toByteArray());
+        
+        // Create mutations for the global index
+        // Row is field value, colf is field name, colq is partitionid\0language, value is Uid.List object
+        Mutation gm = new Mutation(index.getValue());
+        gm.put(index.getKey(), partitionId + NULL_BYTE + language, cv, article.getTimestamp(), val);
+        context.write(indexTableName, gm);
+        
+        // Create mutations for the global reverse index
+        Mutation grm = new Mutation(StringUtils.reverse(index.getValue()));
+        grm.put(index.getKey(), partitionId + NULL_BYTE + language, cv, article.getTimestamp(), val);
+        context.write(reverseIndexTableName, grm);
+        
+        // Create mutations for the metadata table.
+        Mutation mm = new Mutation(index.getKey());
+        mm.put(METADATA_INDEX_COLUMN_FAMILY, language + NULL_BYTE + LcNoDiacriticsNormalizer.class.getName(), cv, article.getTimestamp(), NULL_VALUE);
+        context.write(metadataTableName, mm);
+        
+      }
+      // Add the entire text to the document section of the table.
+      // row is the partition, colf is 'd', colq is language\0articleid, value is the Base64 encoded document text
+      m.put(DOCUMENT_COLUMN_FAMILY, colfPrefix + article.getId(), cv, article.getTimestamp(), new Value(Base64.encodeBase64(article.getText().getBytes())));
+      context.write(tablename, m);
+      
+    } else {
+      context.getCounter("wikipedia", "invalid articles").increment(1);
+    }
+    context.progress();
+  }
+  
+  /**
+   * Tokenize the wikipedia content
+   * 
+   * @param article the article whose text is tokenized
+   * @return the set of distinct, non-empty tokens from the article text
+   * @throws IOException
+   */
+  private Set<String> getTokens(Article article) throws IOException {
+    Set<String> tokenList = new HashSet<String>();
+    WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()));
+    TermAttribute term = tok.addAttribute(TermAttribute.class);
+    StopFilter filter = new StopFilter(false, tok, stopwords, true);
+    try {
+      while (filter.incrementToken()) {
+        String token = term.term();
+        if (!StringUtils.isEmpty(token))
+          tokenList.add(token);
+      }
+    } catch (IOException e) {
+      log.error("Error tokenizing text", e);
+    } finally {
+      try {
+        tok.end();
+      } catch (IOException e) {
+        log.error("Error calling end()", e);
+      } finally {
+        try {
+          tok.close();
+        } catch (IOException e) {
+          log.error("Error closing tokenizer", e);
+        }
+      }
+    }
+    return tokenList;
+  }
+  
+}
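
As a worked illustration of the layouts described in the comments above (a hypothetical article with id 12345 from an "enwiki" dump, a hypothetical field TITLE whose raw value is "Apache Accumulo" and whose normalized value is "apache accumulo", landing in partition 7; \x00 marks the null byte):

    shard table:           row=7, colf=enwiki\x0012345, colq=TITLE\x00Apache Accumulo
    document column:       row=7, colf=d, colq=enwiki\x0012345, value=Base64(article text)
    in-partition index:    row=7, colf=fi\x00TITLE, colq=apache accumulo\x00enwiki\x0012345
    global index:          row=apache accumulo, colf=TITLE, colq=7\x00enwiki, value=Uid.List{COUNT=1, UID=[12345]}
    global reverse index:  row=olumucca ehcapa, colf=TITLE, colq=7\x00enwiki, value=Uid.List{COUNT=1, UID=[12345]}
    metadata table:        row=TITLE, colf=e, colq=enwiki   and   row=TITLE, colf=i, colq=enwiki\x00LcNoDiacriticsNormalizer

Every entry carries the column visibility "all|enwiki" and the article's timestamp.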

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TotalAggregatingIterator.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TotalAggregatingIterator.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TotalAggregatingIterator.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TotalAggregatingIterator.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.iterator;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.accumulo.core.data.ByteSequence;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.PartialKey;
+import org.apache.accumulo.core.data.Range;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.iterators.IteratorEnvironment;
+import org.apache.accumulo.core.iterators.OptionDescriber;
+import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
+import org.apache.accumulo.core.iterators.aggregation.Aggregator;
+import org.apache.accumulo.core.iterators.conf.ColumnToClassMapping;
+import org.apache.accumulo.start.classloader.AccumuloClassLoader;
+
+/**
+ * Aggregate all values with the same key (row, colf, colq, colVis).
+ * 
+ */
+
+public class TotalAggregatingIterator implements SortedKeyValueIterator<Key,Value>, OptionDescriber {
+  
+  private SortedKeyValueIterator<Key,Value> iterator;
+  
+  private Key workKey = new Key();
+  
+  private Key aggrKey;
+  private Value aggrValue;
+  
+  private Aggregator agg;
+  
+  public TotalAggregatingIterator deepCopy(IteratorEnvironment env) {
+    return new TotalAggregatingIterator(this, env);
+  }
+  
+  private TotalAggregatingIterator(TotalAggregatingIterator other, IteratorEnvironment env) {
+    iterator = other.iterator.deepCopy(env);
+    agg = other.agg;
+  }
+  
+  public TotalAggregatingIterator() {}
+  
+  private void aggregateRowColumn(Aggregator aggr) throws IOException {
+    // this function assumes that first value is not delete
+    
+    workKey.set(iterator.getTopKey());
+    
+    Key keyToAggregate = workKey;
+    
+    aggr.reset();
+    
+    aggr.collect(iterator.getTopValue());
+    iterator.next();
+    
+    while (iterator.hasTop() && iterator.getTopKey().equals(keyToAggregate, PartialKey.ROW_COLFAM_COLQUAL_COLVIS)) {
+      aggr.collect(iterator.getTopValue());
+      iterator.next();
+    }
+    
+    aggrKey = workKey;
+    aggrValue = aggr.aggregate();
+    
+  }
+  
+  private void findTop() throws IOException {
+    // check if aggregation is needed
+    if (iterator.hasTop()) {
+      aggregateRowColumn(agg);
+    }
+  }
+  
+  public TotalAggregatingIterator(SortedKeyValueIterator<Key,Value> iterator, ColumnToClassMapping<Aggregator> aggregators) throws IOException {
+    this.iterator = iterator;
+  }
+  
+  @Override
+  public Key getTopKey() {
+    if (aggrKey != null) {
+      return aggrKey;
+    }
+    return iterator.getTopKey();
+  }
+  
+  @Override
+  public Value getTopValue() {
+    if (aggrKey != null) {
+      return aggrValue;
+    }
+    return iterator.getTopValue();
+  }
+  
+  @Override
+  public boolean hasTop() {
+    return aggrKey != null || iterator.hasTop();
+  }
+  
+  @Override
+  public void next() throws IOException {
+    if (aggrKey != null) {
+      aggrKey = null;
+      aggrValue = null;
+    } else {
+      iterator.next();
+    }
+    
+    findTop();
+  }
+  
+  @Override
+  public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException {
+    // do not want to seek to the middle of a value that should be
+    // aggregated...
+    
+    Range seekRange = maximizeStartKeyTimeStamp(range);
+    
+    iterator.seek(seekRange, columnFamilies, inclusive);
+    findTop();
+    
+    if (range.getStartKey() != null) {
+      while (hasTop() && getTopKey().equals(range.getStartKey(), PartialKey.ROW_COLFAM_COLQUAL_COLVIS)
+          && getTopKey().getTimestamp() > range.getStartKey().getTimestamp()) {
+        // the value has a more recent time stamp, so
+        // pass it up
+        // log.debug("skipping "+getTopKey());
+        next();
+      }
+      
+      while (hasTop() && range.beforeStartKey(getTopKey())) {
+        next();
+      }
+    }
+    
+  }
+  
+  @Override
+  public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
+    agg = createAggregator(options);
+    this.iterator = source;
+  }
+  
+  @Override
+  public IteratorOptions describeOptions() {
+    return new IteratorOptions("agg", "Aggregators apply aggregating functions to values with identical keys", null,
+        Collections.singletonList("* <aggregatorClass>"));
+  }
+  
+  @Override
+  public boolean validateOptions(Map<String,String> options) {
+    if (options.size() > 1)
+      throw new IllegalArgumentException("This iterator only accepts one configuration option, the name of the aggregating class");
+    agg = createAggregator(options);
+    return true;
+  }
+  
+  private Aggregator createAggregator(Map<String,String> options) {
+    Aggregator a = null;
+    for (Entry<String,String> entry : options.entrySet()) {
+      try {
+        Class<? extends Aggregator> clazz = AccumuloClassLoader.loadClass(entry.getValue(), Aggregator.class);
+        a = clazz.newInstance();
+      } catch (ClassNotFoundException e) {
+        throw new IllegalArgumentException("class not found: " + entry.getValue());
+      } catch (InstantiationException e) {
+        throw new IllegalArgumentException("instantiation exception: " + entry.getValue());
+      } catch (IllegalAccessException e) {
+        throw new IllegalArgumentException("illegal access exception: " + entry.getValue());
+      }
+    }
+    return a;
+  }
+  
+  static Range maximizeStartKeyTimeStamp(Range range) {
+    Range seekRange = range;
+    
+    if (range.getStartKey() != null && range.getStartKey().getTimestamp() != Long.MAX_VALUE) {
+      Key seekKey = new Key(seekRange.getStartKey());
+      seekKey.setTimestamp(Long.MAX_VALUE);
+      seekRange = new Range(seekKey, true, range.getEndKey(), range.isEndKeyInclusive());
+    }
+    
+    return seekRange;
+  }
+}
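
A small sketch of the seek-range widening above (it assumes the snippet lives in the same package, since maximizeStartKeyTimeStamp is package-private):

    import org.apache.accumulo.core.data.Key;
    import org.apache.accumulo.core.data.Range;
    import org.apache.hadoop.io.Text;

    // A caller may ask to seek to a specific version of a key...
    Key start = new Key(new Text("row"), new Text("colf"), new Text("colq"), 5L);
    Range widened = TotalAggregatingIterator.maximizeStartKeyTimeStamp(new Range(start, null));
    // ...but the iterator widens the start timestamp to Long.MAX_VALUE so that
    // aggregation begins at the newest version of that key; seek() then calls
    // next() past any versions newer than the one originally requested.
    assert widened.getStartKey().getTimestamp() == Long.MAX_VALUE;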

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TotalAggregatingIterator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.normalizer;
+
+import java.text.Normalizer;
+import java.text.Normalizer.Form;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A {@link Normalizer} which performs the following steps:
+ * <ol>
+ * <li>Unicode canonical decomposition ({@link Form#NFD})</li>
+ * <li>Removal of diacritical marks</li>
+ * <li>Unicode canonical composition ({@link Form#NFC})</li>
+ * <li>Lower casing in the {@link Locale#ENGLISH English locale}</li>
+ * </ol>
+ */
+public class LcNoDiacriticsNormalizer implements org.apache.accumulo.examples.wikisearch.normalizer.Normalizer {
+  private static final Pattern diacriticals = Pattern.compile("\\p{InCombiningDiacriticalMarks}");
+  
+  public String normalizeFieldValue(String fieldName, Object fieldValue) {
+    String decomposed = Normalizer.normalize(fieldValue.toString(), Form.NFD);
+    String noDiacriticals = removeDiacriticalMarks(decomposed);
+    String recomposed = Normalizer.normalize(noDiacriticals, Form.NFC);
+    return recomposed.toLowerCase(Locale.ENGLISH);
+  }
+  
+  private String removeDiacriticalMarks(String str) {
+    Matcher matcher = diacriticals.matcher(str);
+    return matcher.replaceAll("");
+  }
+  
+}
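
A quick usage sketch of the normalizer (the input string is a hypothetical example):

    import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;

    LcNoDiacriticsNormalizer normalizer = new LcNoDiacriticsNormalizer();
    // Decompose, strip combining marks, recompose, then lower-case:
    String normalized = normalizer.normalizeFieldValue("TITLE", "Éléphant");
    // normalized is now "elephant"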

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.normalizer;
+
+public class NoOpNormalizer implements Normalizer {
+  public String normalizeFieldValue(String field, Object value) {
+    return value.toString();
+  }
+}

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.normalizer;
+
+public interface Normalizer {
+  
+  /**
+   * Creates normalized content for ingest based upon implemented logic.
+   * 
+   * @param field
+   *          The field being normalized
+   * @param value
+   *          The value to normalize
+   * @return a normalized value
+   */
+  public String normalizeFieldValue(String field, Object value);
+  
+}

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.normalizer;
+
+import org.apache.commons.lang.math.NumberUtils;
+import org.apache.lucene.util.NumericUtils;
+
+public class NumberNormalizer implements Normalizer {
+  
+  public String normalizeFieldValue(String field, Object value) {
+    if (NumberUtils.isNumber(value.toString())) {
+      Number n = NumberUtils.createNumber(value.toString());
+      if (n instanceof Integer)
+        return NumericUtils.intToPrefixCoded((Integer) n);
+      else if (n instanceof Long)
+        return NumericUtils.longToPrefixCoded((Long) n);
+      else if (n instanceof Float)
+        return NumericUtils.floatToPrefixCoded((Float) n);
+      else if (n instanceof Double)
+        return NumericUtils.doubleToPrefixCoded((Double) n);
+      else
+        throw new IllegalArgumentException("Unhandled numeric type: " + n.getClass());
+    } else {
+      throw new IllegalArgumentException("Value is not a number: " + value);
+    }
+  }
+  
+}
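
A brief sketch of how this behaves (it relies on the documented property of Lucene's NumericUtils prefix coding that, for values of the same type, the encoded strings sort in numeric order):

    import org.apache.accumulo.examples.wikisearch.normalizer.NumberNormalizer;

    NumberNormalizer nn = new NumberNormalizer();
    String seven = nn.normalizeFieldValue("SIZE", "7");     // parsed as Integer, int-prefix-coded
    String big   = nn.normalizeFieldValue("SIZE", "1024");  // parsed as Integer, int-prefix-coded
    // Encoded strings compare in the same order as the numbers:
    assert seven.compareTo(big) < 0;
    // Non-numeric input is rejected:
    // nn.normalizeFieldValue("SIZE", "not a number");  // throws IllegalArgumentException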

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/TermWeight.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/TermWeight.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/TermWeight.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/TermWeight.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,424 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: TermWeight.proto
+
+package org.apache.accumulo.examples.wikisearch.protobuf;
+
+public final class TermWeight {
+  private TermWeight() {}
+  
+  public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {}
+  
+  public static final class Info extends com.google.protobuf.GeneratedMessage {
+    // Use Info.newBuilder() to construct.
+    private Info() {
+      initFields();
+    }
+    
+    private Info(boolean noInit) {}
+    
+    private static final Info defaultInstance;
+    
+    public static Info getDefaultInstance() {
+      return defaultInstance;
+    }
+    
+    public Info getDefaultInstanceForType() {
+      return defaultInstance;
+    }
+    
+    public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
+      return org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.internal_static_protobuf_Info_descriptor;
+    }
+    
+    protected com.google.protobuf.GeneratedMessage.FieldAccessorTable internalGetFieldAccessorTable() {
+      return org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.internal_static_protobuf_Info_fieldAccessorTable;
+    }
+    
+    // required float normalizedTermFrequency = 1;
+    public static final int NORMALIZEDTERMFREQUENCY_FIELD_NUMBER = 1;
+    private boolean hasNormalizedTermFrequency;
+    private float normalizedTermFrequency_ = 0F;
+    
+    public boolean hasNormalizedTermFrequency() {
+      return hasNormalizedTermFrequency;
+    }
+    
+    public float getNormalizedTermFrequency() {
+      return normalizedTermFrequency_;
+    }
+    
+    // repeated uint32 wordOffset = 2;
+    public static final int WORDOFFSET_FIELD_NUMBER = 2;
+    private java.util.List<java.lang.Integer> wordOffset_ = java.util.Collections.emptyList();
+    
+    public java.util.List<java.lang.Integer> getWordOffsetList() {
+      return wordOffset_;
+    }
+    
+    public int getWordOffsetCount() {
+      return wordOffset_.size();
+    }
+    
+    public int getWordOffset(int index) {
+      return wordOffset_.get(index);
+    }
+    
+    private void initFields() {}
+    
+    public final boolean isInitialized() {
+      if (!hasNormalizedTermFrequency)
+        return false;
+      return true;
+    }
+    
+    public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException {
+      getSerializedSize();
+      if (hasNormalizedTermFrequency()) {
+        output.writeFloat(1, getNormalizedTermFrequency());
+      }
+      for (int element : getWordOffsetList()) {
+        output.writeUInt32(2, element);
+      }
+      getUnknownFields().writeTo(output);
+    }
+    
+    private int memoizedSerializedSize = -1;
+    
+    public int getSerializedSize() {
+      int size = memoizedSerializedSize;
+      if (size != -1)
+        return size;
+      
+      size = 0;
+      if (hasNormalizedTermFrequency()) {
+        size += com.google.protobuf.CodedOutputStream.computeFloatSize(1, getNormalizedTermFrequency());
+      }
+      {
+        int dataSize = 0;
+        for (int element : getWordOffsetList()) {
+          dataSize += com.google.protobuf.CodedOutputStream.computeUInt32SizeNoTag(element);
+        }
+        size += dataSize;
+        size += 1 * getWordOffsetList().size();
+      }
+      size += getUnknownFields().getSerializedSize();
+      memoizedSerializedSize = size;
+      return size;
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data).buildParsed();
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.ByteString data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data).buildParsed();
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(java.io.InputStream input) throws java.io.IOException {
+      return newBuilder().mergeFrom(input).buildParsed();
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws java.io.IOException {
+      return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseDelimitedFrom(java.io.InputStream input) throws java.io.IOException {
+      Builder builder = newBuilder();
+      if (builder.mergeDelimitedFrom(input)) {
+        return builder.buildParsed();
+      } else {
+        return null;
+      }
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseDelimitedFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws java.io.IOException {
+      Builder builder = newBuilder();
+      if (builder.mergeDelimitedFrom(input, extensionRegistry)) {
+        return builder.buildParsed();
+      } else {
+        return null;
+      }
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.CodedInputStream input) throws java.io.IOException {
+      return newBuilder().mergeFrom(input).buildParsed();
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws java.io.IOException {
+      return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+    }
+    
+    public static Builder newBuilder() {
+      return Builder.create();
+    }
+    
+    public Builder newBuilderForType() {
+      return newBuilder();
+    }
+    
+    public static Builder newBuilder(org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info prototype) {
+      return newBuilder().mergeFrom(prototype);
+    }
+    
+    public Builder toBuilder() {
+      return newBuilder(this);
+    }
+    
+    public static final class Builder extends com.google.protobuf.GeneratedMessage.Builder<Builder> {
+      private org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info result;
+      
+      // Construct using protobuf.TermWeight.Info.newBuilder()
+      private Builder() {}
+      
+      private static Builder create() {
+        Builder builder = new Builder();
+        builder.result = new org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info();
+        return builder;
+      }
+      
+      protected org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info internalGetResult() {
+        return result;
+      }
+      
+      public Builder clear() {
+        if (result == null) {
+          throw new IllegalStateException("Cannot call clear() after build().");
+        }
+        result = new org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info();
+        return this;
+      }
+      
+      public Builder clone() {
+        return create().mergeFrom(result);
+      }
+      
+      public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() {
+        return org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.getDescriptor();
+      }
+      
+      public org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info getDefaultInstanceForType() {
+        return org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.getDefaultInstance();
+      }
+      
+      public boolean isInitialized() {
+        return result.isInitialized();
+      }
+      
+      public org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info build() {
+        if (result != null && !isInitialized()) {
+          throw newUninitializedMessageException(result);
+        }
+        return buildPartial();
+      }
+      
+      private org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info buildParsed() throws com.google.protobuf.InvalidProtocolBufferException {
+        if (!isInitialized()) {
+          throw newUninitializedMessageException(result).asInvalidProtocolBufferException();
+        }
+        return buildPartial();
+      }
+      
+      public org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info buildPartial() {
+        if (result == null) {
+          throw new IllegalStateException("build() has already been called on this Builder.");
+        }
+        if (result.wordOffset_ != java.util.Collections.EMPTY_LIST) {
+          result.wordOffset_ = java.util.Collections.unmodifiableList(result.wordOffset_);
+        }
+        org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info returnMe = result;
+        result = null;
+        return returnMe;
+      }
+      
+      public Builder mergeFrom(com.google.protobuf.Message other) {
+        if (other instanceof org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info) {
+          return mergeFrom((org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info) other);
+        } else {
+          super.mergeFrom(other);
+          return this;
+        }
+      }
+      
+      public Builder mergeFrom(org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info other) {
+        if (other == org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.getDefaultInstance())
+          return this;
+        if (other.hasNormalizedTermFrequency()) {
+          setNormalizedTermFrequency(other.getNormalizedTermFrequency());
+        }
+        if (!other.wordOffset_.isEmpty()) {
+          if (result.wordOffset_.isEmpty()) {
+            result.wordOffset_ = new java.util.ArrayList<java.lang.Integer>();
+          }
+          result.wordOffset_.addAll(other.wordOffset_);
+        }
+        this.mergeUnknownFields(other.getUnknownFields());
+        return this;
+      }
+      
+      public Builder mergeFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+          throws java.io.IOException {
+        com.google.protobuf.UnknownFieldSet.Builder unknownFields = com.google.protobuf.UnknownFieldSet.newBuilder(this.getUnknownFields());
+        while (true) {
+          int tag = input.readTag();
+          switch (tag) {
+            case 0:
+              this.setUnknownFields(unknownFields.build());
+              return this;
+            default: {
+              if (!parseUnknownField(input, unknownFields, extensionRegistry, tag)) {
+                this.setUnknownFields(unknownFields.build());
+                return this;
+              }
+              break;
+            }
+            case 13: {
+              setNormalizedTermFrequency(input.readFloat());
+              break;
+            }
+            case 16: {
+              addWordOffset(input.readUInt32());
+              break;
+            }
+            case 18: {
+              int length = input.readRawVarint32();
+              int limit = input.pushLimit(length);
+              while (input.getBytesUntilLimit() > 0) {
+                addWordOffset(input.readUInt32());
+              }
+              input.popLimit(limit);
+              break;
+            }
+          }
+        }
+      }
+      
+      // required float normalizedTermFrequency = 1;
+      public boolean hasNormalizedTermFrequency() {
+        return result.hasNormalizedTermFrequency();
+      }
+      
+      public float getNormalizedTermFrequency() {
+        return result.getNormalizedTermFrequency();
+      }
+      
+      public Builder setNormalizedTermFrequency(float value) {
+        result.hasNormalizedTermFrequency = true;
+        result.normalizedTermFrequency_ = value;
+        return this;
+      }
+      
+      public Builder clearNormalizedTermFrequency() {
+        result.hasNormalizedTermFrequency = false;
+        result.normalizedTermFrequency_ = 0F;
+        return this;
+      }
+      
+      // repeated uint32 wordOffset = 2;
+      public java.util.List<java.lang.Integer> getWordOffsetList() {
+        return java.util.Collections.unmodifiableList(result.wordOffset_);
+      }
+      
+      public int getWordOffsetCount() {
+        return result.getWordOffsetCount();
+      }
+      
+      public int getWordOffset(int index) {
+        return result.getWordOffset(index);
+      }
+      
+      public Builder setWordOffset(int index, int value) {
+        result.wordOffset_.set(index, value);
+        return this;
+      }
+      
+      public Builder addWordOffset(int value) {
+        if (result.wordOffset_.isEmpty()) {
+          result.wordOffset_ = new java.util.ArrayList<java.lang.Integer>();
+        }
+        result.wordOffset_.add(value);
+        return this;
+      }
+      
+      public Builder addAllWordOffset(java.lang.Iterable<? extends java.lang.Integer> values) {
+        if (result.wordOffset_.isEmpty()) {
+          result.wordOffset_ = new java.util.ArrayList<java.lang.Integer>();
+        }
+        super.addAll(values, result.wordOffset_);
+        return this;
+      }
+      
+      public Builder clearWordOffset() {
+        result.wordOffset_ = java.util.Collections.emptyList();
+        return this;
+      }
+      
+      // @@protoc_insertion_point(builder_scope:protobuf.Info)
+    }
+    
+    static {
+      defaultInstance = new Info(true);
+      org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.internalForceInit();
+      defaultInstance.initFields();
+    }
+    
+    // @@protoc_insertion_point(class_scope:protobuf.Info)
+  }
+  
+  private static com.google.protobuf.Descriptors.Descriptor internal_static_protobuf_Info_descriptor;
+  private static com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_protobuf_Info_fieldAccessorTable;
+  
+  public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
+    return descriptor;
+  }
+  
+  private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
+  static {
+    java.lang.String[] descriptorData = {"\n\020TermWeight.proto\022\010protobuf\";\n\004Info\022\037\n\027"
+        + "normalizedTermFrequency\030\001 \002(\002\022\022\n\nwordOff" + "set\030\002 \003(\rB\014\n\010protobufH\001"};
+    com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() {
+      public com.google.protobuf.ExtensionRegistry assignDescriptors(com.google.protobuf.Descriptors.FileDescriptor root) {
+        descriptor = root;
+        internal_static_protobuf_Info_descriptor = getDescriptor().getMessageTypes().get(0);
+        internal_static_protobuf_Info_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable(
+            internal_static_protobuf_Info_descriptor, new java.lang.String[] {"NormalizedTermFrequency", "WordOffset",}, org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.class,
+            org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.Builder.class);
+        return null;
+      }
+    };
+    com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[] {},
+        assigner);
+  }
+  
+  public static void internalForceInit() {}
+  
+  // @@protoc_insertion_point(outer_class_scope)
+}

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/TermWeight.java
------------------------------------------------------------------------------
    svn:eol-style = native

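The generated TermWeight.Info message above is constructed through its Builder. A minimal sketch of building, serializing, and re-parsing one (the field values and the wrapper class name are illustrative assumptions, not taken from the commit):

    import org.apache.accumulo.examples.wikisearch.protobuf.TermWeight;

    public class TermWeightInfoSketch {
      public static void main(String[] args) throws Exception {
        // Build an Info message with the required term frequency and a few word offsets.
        TermWeight.Info info = TermWeight.Info.newBuilder()
            .setNormalizedTermFrequency(0.25f)
            .addWordOffset(4)
            .addWordOffset(19)
            .build();

        // Serialize to bytes (e.g. for storage in an Accumulo value) and parse it back.
        byte[] bytes = info.toByteArray();
        TermWeight.Info copy = TermWeight.Info.parseFrom(bytes);
        System.out.println(copy.getWordOffsetCount() + " offsets");
      }
    }
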
Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/Uid.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/Uid.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/Uid.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/Uid.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,470 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: Uid.proto
+
+package org.apache.accumulo.examples.wikisearch.protobuf;
+
+public final class Uid {
+  private Uid() {}
+  
+  public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {}
+  
+  public static final class List extends com.google.protobuf.GeneratedMessage {
+    // Use List.newBuilder() to construct.
+    private List() {
+      initFields();
+    }
+    
+    private List(boolean noInit) {}
+    
+    private static final List defaultInstance;
+    
+    public static List getDefaultInstance() {
+      return defaultInstance;
+    }
+    
+    public List getDefaultInstanceForType() {
+      return defaultInstance;
+    }
+    
+    public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
+      return org.apache.accumulo.examples.wikisearch.protobuf.Uid.internal_static_protobuf_List_descriptor;
+    }
+    
+    protected com.google.protobuf.GeneratedMessage.FieldAccessorTable internalGetFieldAccessorTable() {
+      return org.apache.accumulo.examples.wikisearch.protobuf.Uid.internal_static_protobuf_List_fieldAccessorTable;
+    }
+    
+    // required bool IGNORE = 1;
+    public static final int IGNORE_FIELD_NUMBER = 1;
+    private boolean hasIGNORE;
+    private boolean iGNORE_ = false;
+    
+    public boolean hasIGNORE() {
+      return hasIGNORE;
+    }
+    
+    public boolean getIGNORE() {
+      return iGNORE_;
+    }
+    
+    // required uint64 COUNT = 2;
+    public static final int COUNT_FIELD_NUMBER = 2;
+    private boolean hasCOUNT;
+    private long cOUNT_ = 0L;
+    
+    public boolean hasCOUNT() {
+      return hasCOUNT;
+    }
+    
+    public long getCOUNT() {
+      return cOUNT_;
+    }
+    
+    // repeated string UID = 3;
+    public static final int UID_FIELD_NUMBER = 3;
+    private java.util.List<java.lang.String> uID_ = java.util.Collections.emptyList();
+    
+    public java.util.List<java.lang.String> getUIDList() {
+      return uID_;
+    }
+    
+    public int getUIDCount() {
+      return uID_.size();
+    }
+    
+    public java.lang.String getUID(int index) {
+      return uID_.get(index);
+    }
+    
+    private void initFields() {}
+    
+    public final boolean isInitialized() {
+      if (!hasIGNORE)
+        return false;
+      if (!hasCOUNT)
+        return false;
+      return true;
+    }
+    
+    public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException {
+      getSerializedSize();
+      if (hasIGNORE()) {
+        output.writeBool(1, getIGNORE());
+      }
+      if (hasCOUNT()) {
+        output.writeUInt64(2, getCOUNT());
+      }
+      for (java.lang.String element : getUIDList()) {
+        output.writeString(3, element);
+      }
+      getUnknownFields().writeTo(output);
+    }
+    
+    private int memoizedSerializedSize = -1;
+    
+    public int getSerializedSize() {
+      int size = memoizedSerializedSize;
+      if (size != -1)
+        return size;
+      
+      size = 0;
+      if (hasIGNORE()) {
+        size += com.google.protobuf.CodedOutputStream.computeBoolSize(1, getIGNORE());
+      }
+      if (hasCOUNT()) {
+        size += com.google.protobuf.CodedOutputStream.computeUInt64Size(2, getCOUNT());
+      }
+      {
+        int dataSize = 0;
+        for (java.lang.String element : getUIDList()) {
+          dataSize += com.google.protobuf.CodedOutputStream.computeStringSizeNoTag(element);
+        }
+        size += dataSize;
+        size += 1 * getUIDList().size();
+      }
+      size += getUnknownFields().getSerializedSize();
+      memoizedSerializedSize = size;
+      return size;
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data).buildParsed();
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.ByteString data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data).buildParsed();
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws com.google.protobuf.InvalidProtocolBufferException {
+      return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(java.io.InputStream input) throws java.io.IOException {
+      return newBuilder().mergeFrom(input).buildParsed();
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws java.io.IOException {
+      return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseDelimitedFrom(java.io.InputStream input) throws java.io.IOException {
+      Builder builder = newBuilder();
+      if (builder.mergeDelimitedFrom(input)) {
+        return builder.buildParsed();
+      } else {
+        return null;
+      }
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseDelimitedFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws java.io.IOException {
+      Builder builder = newBuilder();
+      if (builder.mergeDelimitedFrom(input, extensionRegistry)) {
+        return builder.buildParsed();
+      } else {
+        return null;
+      }
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.CodedInputStream input) throws java.io.IOException {
+      return newBuilder().mergeFrom(input).buildParsed();
+    }
+    
+    public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+        throws java.io.IOException {
+      return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+    }
+    
+    public static Builder newBuilder() {
+      return Builder.create();
+    }
+    
+    public Builder newBuilderForType() {
+      return newBuilder();
+    }
+    
+    public static Builder newBuilder(org.apache.accumulo.examples.wikisearch.protobuf.Uid.List prototype) {
+      return newBuilder().mergeFrom(prototype);
+    }
+    
+    public Builder toBuilder() {
+      return newBuilder(this);
+    }
+    
+    public static final class Builder extends com.google.protobuf.GeneratedMessage.Builder<Builder> {
+      private org.apache.accumulo.examples.wikisearch.protobuf.Uid.List result;
+      
+      // Construct using protobuf.Uid.List.newBuilder()
+      private Builder() {}
+      
+      private static Builder create() {
+        Builder builder = new Builder();
+        builder.result = new org.apache.accumulo.examples.wikisearch.protobuf.Uid.List();
+        return builder;
+      }
+      
+      protected org.apache.accumulo.examples.wikisearch.protobuf.Uid.List internalGetResult() {
+        return result;
+      }
+      
+      public Builder clear() {
+        if (result == null) {
+          throw new IllegalStateException("Cannot call clear() after build().");
+        }
+        result = new org.apache.accumulo.examples.wikisearch.protobuf.Uid.List();
+        return this;
+      }
+      
+      public Builder clone() {
+        return create().mergeFrom(result);
+      }
+      
+      public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() {
+        return org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.getDescriptor();
+      }
+      
+      public org.apache.accumulo.examples.wikisearch.protobuf.Uid.List getDefaultInstanceForType() {
+        return org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.getDefaultInstance();
+      }
+      
+      public boolean isInitialized() {
+        return result.isInitialized();
+      }
+      
+      public org.apache.accumulo.examples.wikisearch.protobuf.Uid.List build() {
+        if (result != null && !isInitialized()) {
+          throw newUninitializedMessageException(result);
+        }
+        return buildPartial();
+      }
+      
+      private org.apache.accumulo.examples.wikisearch.protobuf.Uid.List buildParsed() throws com.google.protobuf.InvalidProtocolBufferException {
+        if (!isInitialized()) {
+          throw newUninitializedMessageException(result).asInvalidProtocolBufferException();
+        }
+        return buildPartial();
+      }
+      
+      public org.apache.accumulo.examples.wikisearch.protobuf.Uid.List buildPartial() {
+        if (result == null) {
+          throw new IllegalStateException("build() has already been called on this Builder.");
+        }
+        if (result.uID_ != java.util.Collections.EMPTY_LIST) {
+          result.uID_ = java.util.Collections.unmodifiableList(result.uID_);
+        }
+        org.apache.accumulo.examples.wikisearch.protobuf.Uid.List returnMe = result;
+        result = null;
+        return returnMe;
+      }
+      
+      public Builder mergeFrom(com.google.protobuf.Message other) {
+        if (other instanceof org.apache.accumulo.examples.wikisearch.protobuf.Uid.List) {
+          return mergeFrom((org.apache.accumulo.examples.wikisearch.protobuf.Uid.List) other);
+        } else {
+          super.mergeFrom(other);
+          return this;
+        }
+      }
+      
+      public Builder mergeFrom(org.apache.accumulo.examples.wikisearch.protobuf.Uid.List other) {
+        if (other == org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.getDefaultInstance())
+          return this;
+        if (other.hasIGNORE()) {
+          setIGNORE(other.getIGNORE());
+        }
+        if (other.hasCOUNT()) {
+          setCOUNT(other.getCOUNT());
+        }
+        if (!other.uID_.isEmpty()) {
+          if (result.uID_.isEmpty()) {
+            result.uID_ = new java.util.ArrayList<java.lang.String>();
+          }
+          result.uID_.addAll(other.uID_);
+        }
+        this.mergeUnknownFields(other.getUnknownFields());
+        return this;
+      }
+      
+      public Builder mergeFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+          throws java.io.IOException {
+        com.google.protobuf.UnknownFieldSet.Builder unknownFields = com.google.protobuf.UnknownFieldSet.newBuilder(this.getUnknownFields());
+        while (true) {
+          int tag = input.readTag();
+          switch (tag) {
+            case 0:
+              this.setUnknownFields(unknownFields.build());
+              return this;
+            default: {
+              if (!parseUnknownField(input, unknownFields, extensionRegistry, tag)) {
+                this.setUnknownFields(unknownFields.build());
+                return this;
+              }
+              break;
+            }
+            case 8: {
+              setIGNORE(input.readBool());
+              break;
+            }
+            case 16: {
+              setCOUNT(input.readUInt64());
+              break;
+            }
+            case 26: {
+              addUID(input.readString());
+              break;
+            }
+          }
+        }
+      }
+      
+      // required bool IGNORE = 1;
+      public boolean hasIGNORE() {
+        return result.hasIGNORE();
+      }
+      
+      public boolean getIGNORE() {
+        return result.getIGNORE();
+      }
+      
+      public Builder setIGNORE(boolean value) {
+        result.hasIGNORE = true;
+        result.iGNORE_ = value;
+        return this;
+      }
+      
+      public Builder clearIGNORE() {
+        result.hasIGNORE = false;
+        result.iGNORE_ = false;
+        return this;
+      }
+      
+      // required uint64 COUNT = 2;
+      public boolean hasCOUNT() {
+        return result.hasCOUNT();
+      }
+      
+      public long getCOUNT() {
+        return result.getCOUNT();
+      }
+      
+      public Builder setCOUNT(long value) {
+        result.hasCOUNT = true;
+        result.cOUNT_ = value;
+        return this;
+      }
+      
+      public Builder clearCOUNT() {
+        result.hasCOUNT = false;
+        result.cOUNT_ = 0L;
+        return this;
+      }
+      
+      // repeated string UID = 3;
+      public java.util.List<java.lang.String> getUIDList() {
+        return java.util.Collections.unmodifiableList(result.uID_);
+      }
+      
+      public int getUIDCount() {
+        return result.getUIDCount();
+      }
+      
+      public java.lang.String getUID(int index) {
+        return result.getUID(index);
+      }
+      
+      public Builder setUID(int index, java.lang.String value) {
+        if (value == null) {
+          throw new NullPointerException();
+        }
+        result.uID_.set(index, value);
+        return this;
+      }
+      
+      public Builder addUID(java.lang.String value) {
+        if (value == null) {
+          throw new NullPointerException();
+        }
+        if (result.uID_.isEmpty()) {
+          result.uID_ = new java.util.ArrayList<java.lang.String>();
+        }
+        result.uID_.add(value);
+        return this;
+      }
+      
+      public Builder addAllUID(java.lang.Iterable<? extends java.lang.String> values) {
+        if (result.uID_.isEmpty()) {
+          result.uID_ = new java.util.ArrayList<java.lang.String>();
+        }
+        super.addAll(values, result.uID_);
+        return this;
+      }
+      
+      public Builder clearUID() {
+        result.uID_ = java.util.Collections.emptyList();
+        return this;
+      }
+      
+      // @@protoc_insertion_point(builder_scope:protobuf.List)
+    }
+    
+    static {
+      defaultInstance = new List(true);
+      org.apache.accumulo.examples.wikisearch.protobuf.Uid.internalForceInit();
+      defaultInstance.initFields();
+    }
+    
+    // @@protoc_insertion_point(class_scope:protobuf.List)
+  }
+  
+  private static com.google.protobuf.Descriptors.Descriptor internal_static_protobuf_List_descriptor;
+  private static com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_protobuf_List_fieldAccessorTable;
+  
+  public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
+    return descriptor;
+  }
+  
+  private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
+  static {
+    java.lang.String[] descriptorData = {"\n\tUid.proto\022\010protobuf\"2\n\004List\022\016\n\006IGNORE\030"
+        + "\001 \002(\010\022\r\n\005COUNT\030\002 \002(\004\022\013\n\003UID\030\003 \003(\tB\014\n\010pro" + "tobufH\001"};
+    com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() {
+      public com.google.protobuf.ExtensionRegistry assignDescriptors(com.google.protobuf.Descriptors.FileDescriptor root) {
+        descriptor = root;
+        internal_static_protobuf_List_descriptor = getDescriptor().getMessageTypes().get(0);
+        internal_static_protobuf_List_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable(
+            internal_static_protobuf_List_descriptor, new java.lang.String[] {"IGNORE", "COUNT", "UID",}, org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.class,
+            org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.Builder.class);
+        return null;
+      }
+    };
+    com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[] {},
+        assigner);
+  }
+  
+  public static void internalForceInit() {}
+  
+  // @@protoc_insertion_point(outer_class_scope)
+}

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/Uid.java
------------------------------------------------------------------------------
    svn:eol-style = native

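The Uid.List message above follows the same Builder pattern; a short sketch of combining two partial lists with mergeFrom (the document ids and the recomputed COUNT are illustrative, and this is only a demonstration of the generated API, not of any aggregator shipped with the example):

    import org.apache.accumulo.examples.wikisearch.protobuf.Uid;

    public class UidListMergeSketch {
      public static void main(String[] args) throws Exception {
        Uid.List a = Uid.List.newBuilder().setIGNORE(false).setCOUNT(1L).addUID("doc-0001").build();
        Uid.List b = Uid.List.newBuilder().setIGNORE(false).setCOUNT(1L).addUID("doc-0002").build();

        // newBuilder(prototype) copies a's fields; mergeFrom(b) appends b's UIDs and
        // overwrites the scalar fields b has set, so the combined COUNT is recomputed here.
        Uid.List merged = Uid.List.newBuilder(a)
            .mergeFrom(b)
            .setCOUNT(a.getCOUNT() + b.getCOUNT())
            .build();
        System.out.println(merged.getUIDCount() + " uids, COUNT=" + merged.getCOUNT());
      }
    }
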
Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReader.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReader.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReader.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReader.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.reader;
+
+
+import java.io.IOException;
+
+import org.apache.accumulo.examples.wikisearch.ingest.WikipediaConfiguration;
+import org.apache.accumulo.examples.wikisearch.util.TextUtil;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+
+/**
+ * This class aggregates Text values into single records based on a start token and an end token. A typical use case is record-oriented XML data. It will not
+ * work with data that nests the start and stop tokens.
+ * 
+ */
+public class AggregatingRecordReader extends LongLineRecordReader {
+  
+  public static final String START_TOKEN = "aggregating.token.start";
+  public static final String END_TOKEN = "aggregating.token.end";
+  public static final String RETURN_PARTIAL_MATCHES = "aggregating.allow.partial";
+  
+  private LongWritable key = new LongWritable();
+  private String startToken = null;
+  private String endToken = null;
+  private long counter = 0;
+  private Text aggValue = new Text();
+  private boolean startFound = false;
+  private StringBuilder remainder = new StringBuilder(0);
+  private boolean returnPartialMatches = false;
+  
+  @Override
+  public LongWritable getCurrentKey() {
+    key.set(counter);
+    return key;
+  }
+  
+  @Override
+  public Text getCurrentValue() {
+    return aggValue;
+  }
+  
+  @Override
+  public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
+    super.initialize(genericSplit, context);
+    this.startToken = WikipediaConfiguration.isNull(context.getConfiguration(), START_TOKEN, String.class);
+    this.endToken = WikipediaConfiguration.isNull(context.getConfiguration(), END_TOKEN, String.class);
+    this.returnPartialMatches = context.getConfiguration().getBoolean(RETURN_PARTIAL_MATCHES, false);
+    
+    /*
+     * Text-appending works much like the + operator on Strings: it creates a byte array exactly the size of [prefix + suffix] and copies the bytes
+     * into the new array. This reader does many small appends, one line at a time, and with most XML the documents are split on line boundaries, so there
+     * will generally be many such appends. Pre-allocating a large backing array for the Text object avoids the repeated reallocation and gives us
+     * StringBuilder-like behavior for Text objects.
+     */
+    byte[] txtBuffer = new byte[2048];
+    aggValue.set(txtBuffer);
+  }
+  
+  @Override
+  public boolean nextKeyValue() throws IOException {
+    aggValue.clear();
+    boolean hasNext = false;
+    boolean finished = false;
+    // Find the start token
+    while (!finished && (((hasNext = super.nextKeyValue()) == true) || remainder.length() > 0)) {
+      if (hasNext)
+        finished = process(super.getCurrentValue());
+      else
+        finished = process(null);
+      if (finished) {
+        startFound = false;
+        counter++;
+        return true;
+      }
+    }
+    // If we have anything loaded in the agg value (and we found a start)
+    // then we ran out of data before finding the end. Just return the
+    // data we have and if it's not valid, downstream parsing of the data
+    // will fail.
+    if (returnPartialMatches && startFound && aggValue.getLength() > 0) {
+      startFound = false;
+      counter++;
+      return true;
+    }
+    return false;
+  }
+  
+  /**
+   * Populates aggValue with the contents of the Text object.
+   * 
+   * @param t
+   *          the next line of input, or null when the underlying reader has no more data
+   * @return true if aggValue now holds a complete record, false if more data is needed
+   */
+  private boolean process(Text t) {
+    
+    if (null != t)
+      remainder.append(t.toString());
+    while (remainder.length() > 0) {
+      if (!startFound) {
+        // Look for the start token; if found, begin aggregating at its offset
+        int start = remainder.indexOf(startToken);
+        if (-1 != start) {
+          // Append the start token to the aggregate value
+          TextUtil.textAppendNoNull(aggValue, remainder.substring(start, start + startToken.length()), false);
+          // Remove everything up through the end of the start token from the remainder
+          remainder.delete(0, start + startToken.length());
+          startFound = true;
+        } else {
+          // Still looking for the start token and did not find it, so discard
+          // the buffered bytes
+          remainder.delete(0, remainder.length());
+        }
+      } else {
+        // Try to find the end
+        int end = remainder.indexOf(endToken);
+        // Also try to find the start
+        int start = remainder.indexOf(startToken);
+        if (-1 == end) {
+          if (returnPartialMatches && start >= 0) {
+            // End token not found, but another start token was found...
+            // The amount to copy is up to the beginning of the next start token
+            TextUtil.textAppendNoNull(aggValue, remainder.substring(0, start), false);
+            remainder.delete(0, start);
+            return true;
+          } else {
+            // Not found, aggregate the entire remainder
+            TextUtil.textAppendNoNull(aggValue, remainder.toString(), false);
+            // Delete all chars from remainder
+            remainder.delete(0, remainder.length());
+          }
+        } else {
+          if (returnPartialMatches && start >= 0 && start < end) {
+            // We found the end token, but another start token appears before it,
+            // so close out the current record at that new start token.
+            TextUtil.textAppendNoNull(aggValue, remainder.substring(0, start), false);
+            remainder.delete(0, start);
+            return true;
+          } else {
+            // END_TOKEN was found. Extract to the end of END_TOKEN
+            TextUtil.textAppendNoNull(aggValue, remainder.substring(0, end + endToken.length()), false);
+            // Remove from remainder up to the end of END_TOKEN
+            remainder.delete(0, end + endToken.length());
+            return true;
+          }
+        }
+      }
+    }
+    return false;
+  }
+  
+}

Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReader.java
------------------------------------------------------------------------------
    svn:eol-style = native
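
The AggregatingRecordReader above pulls its start and end tokens from the job configuration in initialize(). A minimal sketch of setting those keys (the <page> tags and the setup class are illustrative; the example's real job setup lives elsewhere):

    import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
    import org.apache.hadoop.conf.Configuration;

    public class ReaderConfigSketch {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Aggregate everything from <page> through </page> into a single record value.
        conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
        conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
        // If a split ends before the end token is seen, emit whatever has been buffered.
        conf.setBoolean(AggregatingRecordReader.RETURN_PARTIAL_MATCHES, true);
      }
    }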