Posted to commits@accumulo.apache.org by ec...@apache.org on 2012/01/12 17:06:20 UTC
svn commit: r1230608 [5/16] - in /incubator/accumulo/trunk: ./
contrib/accumulo_sample/ src/assemble/ src/core/
src/core/src/main/java/org/apache/accumulo/core/client/impl/thrift/
src/core/src/main/java/org/apache/accumulo/core/master/thrift/ src/core/...
Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,211 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.ingest;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.accumulo.core.client.AccumuloException;
+import org.apache.accumulo.core.client.AccumuloSecurityException;
+import org.apache.accumulo.core.client.Connector;
+import org.apache.accumulo.core.client.TableExistsException;
+import org.apache.accumulo.core.client.TableNotFoundException;
+import org.apache.accumulo.core.client.admin.TableOperations;
+import org.apache.accumulo.core.client.mapreduce.AccumuloOutputFormat;
+import org.apache.accumulo.core.conf.Property;
+import org.apache.accumulo.core.data.Mutation;
+import org.apache.accumulo.core.iterators.IteratorUtil.IteratorScope;
+import org.apache.accumulo.core.iterators.aggregation.NumSummation;
+import org.apache.accumulo.core.iterators.aggregation.conf.AggregatorConfiguration;
+import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
+import org.apache.hadoop.util.Tool;
+import org.apache.hadoop.util.ToolRunner;
+
+
+@SuppressWarnings("deprecation")
+public class WikipediaIngester extends Configured implements Tool {
+
+ public final static String INGEST_LANGUAGE = "wikipedia.ingest_language";
+ public final static String SPLIT_FILE = "wikipedia.split_file";
+ public final static String TABLE_NAME = "wikipedia.table";
+
+ public static void main(String[] args) throws Exception {
+ int res = ToolRunner.run(new Configuration(), new WikipediaIngester(), args);
+ System.exit(res);
+ }
+
+ private void createTables(TableOperations tops, String tableName) throws AccumuloException, AccumuloSecurityException, TableNotFoundException,
+ TableExistsException {
+    // Derived table names: index, reverse index, and metadata
+ String indexTableName = tableName + "Index";
+ String reverseIndexTableName = tableName + "ReverseIndex";
+ String metadataTableName = tableName + "Metadata";
+
+ // create the shard table
+ if (!tops.exists(tableName)) {
+      // Set a text index aggregator on the given field names; no aggregator is set if the option is not supplied.
+ String textIndexFamilies = WikipediaMapper.TOKENS_FIELD_NAME;
+
+ if (textIndexFamilies.length() > 0) {
+ System.out.println("Adding content aggregator on the fields: " + textIndexFamilies);
+
+ // Create and set the aggregators in one shot
+ List<AggregatorConfiguration> aggregators = new ArrayList<AggregatorConfiguration>();
+
+ for (String family : StringUtils.split(textIndexFamilies, ',')) {
+ aggregators.add(new AggregatorConfiguration(new Text("fi\0" + family), org.apache.accumulo.examples.wikisearch.aggregator.TextIndexAggregator.class.getName()));
+ }
+
+ tops.create(tableName);
+ tops.addAggregators(tableName, aggregators);
+ } else {
+ tops.create(tableName);
+ }
+
+ // Set the locality group for the full content column family
+ tops.setLocalityGroups(tableName, Collections.singletonMap("WikipediaDocuments", Collections.singleton(new Text(WikipediaMapper.DOCUMENT_COLUMN_FAMILY))));
+
+ }
+
+ if (!tops.exists(indexTableName)) {
+ tops.create(indexTableName);
+ // Add the UID aggregator
+ for (IteratorScope scope : IteratorScope.values()) {
+ String stem = String.format("%s%s.%s", Property.TABLE_ITERATOR_PREFIX, scope.name(), "UIDAggregator");
+ tops.setProperty(indexTableName, stem, "19,org.apache.accumulo.examples.wikisearch.iterator.TotalAggregatingIterator");
+ stem += ".opt.";
+ tops.setProperty(indexTableName, stem + "*", "org.apache.accumulo.examples.wikisearch.aggregator.GlobalIndexUidAggregator");
+
+ }
+ }
+
+ if (!tops.exists(reverseIndexTableName)) {
+ tops.create(reverseIndexTableName);
+ // Add the UID aggregator
+ for (IteratorScope scope : IteratorScope.values()) {
+ String stem = String.format("%s%s.%s", Property.TABLE_ITERATOR_PREFIX, scope.name(), "UIDAggregator");
+ tops.setProperty(reverseIndexTableName, stem, "19,org.apache.accumulo.examples.wikisearch.iterator.TotalAggregatingIterator");
+ stem += ".opt.";
+ tops.setProperty(reverseIndexTableName, stem + "*", "org.apache.accumulo.examples.wikisearch.aggregator.GlobalIndexUidAggregator");
+
+ }
+ }
+
+ if (!tops.exists(metadataTableName)) {
+ // Add the NumSummation aggregator for the frequency column
+ List<AggregatorConfiguration> aggregators = new ArrayList<AggregatorConfiguration>();
+ aggregators.add(new AggregatorConfiguration(new Text("f"), NumSummation.class.getName()));
+ tops.create(metadataTableName);
+ tops.addAggregators(metadataTableName, aggregators);
+ }
+ }
+
+ @Override
+ public int run(String[] args) throws Exception {
+ Job job = new Job(getConf(), "Ingest Wikipedia");
+ Configuration conf = job.getConfiguration();
+ conf.set("mapred.map.tasks.speculative.execution", "false");
+
+ String tablename = WikipediaConfiguration.getTableName(conf);
+
+ String zookeepers = WikipediaConfiguration.getZookeepers(conf);
+ String instanceName = WikipediaConfiguration.getInstanceName(conf);
+
+ String user = WikipediaConfiguration.getUser(conf);
+ byte[] password = WikipediaConfiguration.getPassword(conf);
+ Connector connector = WikipediaConfiguration.getConnector(conf);
+
+ TableOperations tops = connector.tableOperations();
+
+ createTables(tops, tablename);
+
+ configureJob(job);
+
+ List<Path> inputPaths = new ArrayList<Path>();
+ SortedSet<String> languages = new TreeSet<String>();
+ FileSystem fs = FileSystem.get(conf);
+ Path parent = new Path(conf.get("wikipedia.input"));
+ listFiles(parent, fs, inputPaths, languages);
+
+ System.out.println("Input files in " + parent + ":" + inputPaths.size());
+ Path[] inputPathsArray = new Path[inputPaths.size()];
+ inputPaths.toArray(inputPathsArray);
+
+ System.out.println("Languages:" + languages.size());
+
+ FileInputFormat.setInputPaths(job, inputPathsArray);
+
+ job.setMapperClass(WikipediaMapper.class);
+ job.setNumReduceTasks(0);
+ job.setMapOutputKeyClass(Text.class);
+ job.setMapOutputValueClass(Mutation.class);
+ job.setOutputFormatClass(AccumuloOutputFormat.class);
+ AccumuloOutputFormat.setOutputInfo(job, user, password, true, tablename);
+ AccumuloOutputFormat.setZooKeeperInstance(job, instanceName, zookeepers);
+
+ return job.waitForCompletion(true) ? 0 : 1;
+ }
+
+ public final static PathFilter partFilter = new PathFilter() {
+ @Override
+ public boolean accept(Path path) {
+ return path.getName().startsWith("part");
+    }
+ };
+
+ protected void configureJob(Job job) {
+ Configuration conf = job.getConfiguration();
+ job.setJarByClass(WikipediaIngester.class);
+ job.setInputFormatClass(WikipediaInputFormat.class);
+ conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
+ conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
+ }
+
+  protected static final Pattern filePattern = Pattern.compile("([a-z_]+).*\\.xml(\\.bz2)?"); // dots escaped so ".xml" and ".bz2" match literally
+
+ protected void listFiles(Path path, FileSystem fs, List<Path> files, Set<String> languages) throws IOException {
+ for (FileStatus status : fs.listStatus(path)) {
+ if (status.isDir()) {
+ listFiles(status.getPath(), fs, files, languages);
+ } else {
+ Path p = status.getPath();
+ Matcher matcher = filePattern.matcher(p.getName());
+ if (matcher.matches()) {
+ languages.add(matcher.group(1));
+ files.add(p);
+ }
+ }
+ }
+ }
+}
Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java
------------------------------------------------------------------------------
svn:eol-style = native
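
A minimal sketch of driving this tool, assuming a hypothetical dump
directory /data/wikipedia (run() above reads the input directory from the
"wikipedia.input" key; everything else comes from WikipediaConfiguration):

    import org.apache.accumulo.examples.wikisearch.ingest.WikipediaIngester;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.util.ToolRunner;

    public class IngestLauncher {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // run() reads the dump directory from this key; the path here is hypothetical
        conf.set("wikipedia.input", "/data/wikipedia");
        System.exit(ToolRunner.run(conf, new WikipediaIngester(), args));
      }
    }
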
Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.ingest;
+
+import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
+
+
+public class WikipediaInputFormat extends TextInputFormat {
+
+ @Override
+ public RecordReader<LongWritable,Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
+ return new AggregatingRecordReader();
+ }
+
+ @Override
+ protected boolean isSplitable(JobContext context, Path file) {
+ return false;
+ }
+
+}
Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java
------------------------------------------------------------------------------
svn:eol-style = native
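
WikipediaInputFormat marks files non-splitable and hands each split to an
AggregatingRecordReader configured (in WikipediaIngester.configureJob) with
"<page>" and "</page>" as start and end tokens. An illustrative
simplification of that record-boundary logic, not the actual reader:

    import java.io.BufferedReader;
    import java.io.IOException;
    import java.io.StringReader;

    public class PageRecordSketch {
      public static void main(String[] args) throws IOException {
        String xml = "<mediawiki>\n<page>\n<title>A</title>\n</page>\n"
            + "<page>\n<title>B</title>\n</page>\n</mediawiki>";
        BufferedReader in = new BufferedReader(new StringReader(xml));
        StringBuilder record = null;
        String line;
        while ((line = in.readLine()) != null) {
          if (line.contains("<page>"))
            record = new StringBuilder(); // start of a record
          if (record != null)
            record.append(line).append('\n');
          if (record != null && line.contains("</page>")) {
            // one whole <page>...</page> element becomes a single map input value
            System.out.println("record:\n" + record);
            record = null;
          }
        }
      }
    }
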
Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,258 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.ingest;
+
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.StringReader;
+import java.nio.charset.Charset;
+import java.util.HashSet;
+import java.util.IllegalFormatException;
+import java.util.Map.Entry;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+
+import org.apache.accumulo.core.data.Mutation;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.security.ColumnVisibility;
+import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor.Article;
+import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;
+import org.apache.accumulo.examples.wikisearch.protobuf.Uid;
+import org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.Builder;
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.lang.StringUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Mapper;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.log4j.Logger;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.ar.ArabicAnalyzer;
+import org.apache.lucene.analysis.br.BrazilianAnalyzer;
+import org.apache.lucene.analysis.cjk.CJKAnalyzer;
+import org.apache.lucene.analysis.de.GermanAnalyzer;
+import org.apache.lucene.analysis.el.GreekAnalyzer;
+import org.apache.lucene.analysis.fa.PersianAnalyzer;
+import org.apache.lucene.analysis.fr.FrenchAnalyzer;
+import org.apache.lucene.analysis.nl.DutchAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;
+
+
+import com.google.common.collect.HashMultimap;
+import com.google.common.collect.Multimap;
+
+public class WikipediaMapper extends Mapper<LongWritable,Text,Text,Mutation> {
+
+ private static final Logger log = Logger.getLogger(WikipediaMapper.class);
+
+ public final static Charset UTF8 = Charset.forName("UTF-8");
+ public static final String DOCUMENT_COLUMN_FAMILY = "d";
+ public static final String METADATA_EVENT_COLUMN_FAMILY = "e";
+ public static final String METADATA_INDEX_COLUMN_FAMILY = "i";
+ public static final String TOKENS_FIELD_NAME = "TEXT";
+
+  private final static Pattern languagePattern = Pattern.compile("([a-z_]+).*\\.xml(\\.bz2)?"); // dots escaped so ".xml" and ".bz2" match literally
+ private static final Value NULL_VALUE = new Value(new byte[0]);
+ private static final String cvPrefix = "all|";
+
+ private ArticleExtractor extractor;
+ private String language;
+ private int numPartitions = 0;
+ private Set<?> stopwords = null;
+ private ColumnVisibility cv = null;
+
+ private Text tablename = null;
+ private Text indexTableName = null;
+ private Text reverseIndexTableName = null;
+ private Text metadataTableName = null;
+
+ @Override
+ public void setup(Context context) {
+ Configuration conf = context.getConfiguration();
+ tablename = new Text(WikipediaConfiguration.getTableName(conf));
+ indexTableName = new Text(tablename + "Index");
+ reverseIndexTableName = new Text(tablename + "ReverseIndex");
+ metadataTableName = new Text(tablename + "Metadata");
+
+ FileSplit split = (FileSplit) context.getInputSplit();
+ String fileName = split.getPath().getName();
+ Matcher matcher = languagePattern.matcher(fileName);
+ if (matcher.matches()) {
+ language = matcher.group(1).replace('_', '-').toLowerCase();
+ if (language.equals("arwiki"))
+ stopwords = ArabicAnalyzer.getDefaultStopSet();
+ else if (language.equals("brwiki"))
+ stopwords = BrazilianAnalyzer.getDefaultStopSet();
+ else if (language.startsWith("zh"))
+ stopwords = CJKAnalyzer.getDefaultStopSet();
+ else if (language.equals("dewiki"))
+ stopwords = GermanAnalyzer.getDefaultStopSet();
+ else if (language.equals("elwiki"))
+ stopwords = GreekAnalyzer.getDefaultStopSet();
+ else if (language.equals("fawiki"))
+ stopwords = PersianAnalyzer.getDefaultStopSet();
+ else if (language.equals("frwiki"))
+ stopwords = FrenchAnalyzer.getDefaultStopSet();
+ else if (language.equals("nlwiki"))
+ stopwords = DutchAnalyzer.getDefaultStopSet();
+ else
+ stopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+
+ } else {
+ throw new RuntimeException("Unknown ingest language! " + fileName);
+ }
+ extractor = new ArticleExtractor();
+ numPartitions = WikipediaConfiguration.getNumPartitions(conf);
+ cv = new ColumnVisibility(cvPrefix + language);
+
+ }
+
+  /**
+   * Partition the documents based on the document id.
+   *
+   * @param article
+   *          the article whose id determines its partition
+   * @param numPartitions
+   *          the total number of partitions (shards)
+   * @return the partition id for the given article
+   * @throws IllegalFormatException
+   */
+ public static int getPartitionId(Article article, int numPartitions) throws IllegalFormatException {
+ return article.getId() % numPartitions;
+ }
+
+ @Override
+ protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
+    Article article = extractor.extract(new InputStreamReader(new ByteArrayInputStream(value.getBytes(), 0, value.getLength()), UTF8)); // Text's backing array may be longer than its valid length
+ String NULL_BYTE = "\u0000";
+ String colfPrefix = language + NULL_BYTE;
+ String indexPrefix = "fi" + NULL_BYTE;
+ if (article != null) {
+ Text partitionId = new Text(Integer.toString(WikipediaMapper.getPartitionId(article, numPartitions)));
+
+ // Create the mutations for the document.
+      // Row is partition id, colf is language\0article id, colq is fieldName\0fieldValue
+ Mutation m = new Mutation(partitionId);
+ for (Entry<String,Object> entry : article.getFieldValues().entrySet()) {
+ m.put(colfPrefix + article.getId(), entry.getKey() + NULL_BYTE + entry.getValue().toString(), cv, article.getTimestamp(), NULL_VALUE);
+ // Create mutations for the metadata table.
+ Mutation mm = new Mutation(entry.getKey());
+ mm.put(METADATA_EVENT_COLUMN_FAMILY, language, cv, article.getTimestamp(), NULL_VALUE);
+ context.write(metadataTableName, mm);
+ }
+
+ // Tokenize the content
+ Set<String> tokens = getTokens(article);
+
+ // We are going to put the fields to be indexed into a multimap. This allows us to iterate
+ // over the entire set once.
+ Multimap<String,String> indexFields = HashMultimap.create();
+ // Add the normalized field values
+ LcNoDiacriticsNormalizer normalizer = new LcNoDiacriticsNormalizer();
+ for (Entry<String,String> index : article.getNormalizedFieldValues().entrySet())
+ indexFields.put(index.getKey(), index.getValue());
+ // Add the tokens
+ for (String token : tokens)
+ indexFields.put(TOKENS_FIELD_NAME, normalizer.normalizeFieldValue("", token));
+
+ for (Entry<String,String> index : indexFields.entries()) {
+ // Create mutations for the in partition index
+ // Row is partition id, colf is 'fi'\0fieldName, colq is fieldValue\0language\0article id
+ m.put(indexPrefix + index.getKey(), index.getValue() + NULL_BYTE + colfPrefix + article.getId(), cv, article.getTimestamp(), NULL_VALUE);
+
+ // Create mutations for the global index
+ // Create a UID object for the Value
+ Builder uidBuilder = Uid.List.newBuilder();
+ uidBuilder.setIGNORE(false);
+ uidBuilder.setCOUNT(1);
+ uidBuilder.addUID(Integer.toString(article.getId()));
+ Uid.List uidList = uidBuilder.build();
+ Value val = new Value(uidList.toByteArray());
+
+ // Create mutations for the global index
+ // Row is field value, colf is field name, colq is partitionid\0language, value is Uid.List object
+ Mutation gm = new Mutation(index.getValue());
+ gm.put(index.getKey(), partitionId + NULL_BYTE + language, cv, article.getTimestamp(), val);
+ context.write(indexTableName, gm);
+
+ // Create mutations for the global reverse index
+ Mutation grm = new Mutation(StringUtils.reverse(index.getValue()));
+ grm.put(index.getKey(), partitionId + NULL_BYTE + language, cv, article.getTimestamp(), val);
+ context.write(reverseIndexTableName, grm);
+
+ // Create mutations for the metadata table.
+ Mutation mm = new Mutation(index.getKey());
+ mm.put(METADATA_INDEX_COLUMN_FAMILY, language + NULL_BYTE + LcNoDiacriticsNormalizer.class.getName(), cv, article.getTimestamp(), NULL_VALUE);
+ context.write(metadataTableName, mm);
+
+ }
+ // Add the entire text to the document section of the table.
+      // row is the partition, colf is 'd', colq is language\0article id, value is the Base64 encoded document
+ m.put(DOCUMENT_COLUMN_FAMILY, colfPrefix + article.getId(), cv, article.getTimestamp(), new Value(Base64.encodeBase64(article.getText().getBytes())));
+ context.write(tablename, m);
+
+ } else {
+ context.getCounter("wikipedia", "invalid articles").increment(1);
+ }
+ context.progress();
+ }
+
+  /**
+   * Tokenize the wikipedia article content.
+   *
+   * @param article
+   *          the article whose text is tokenized
+   * @return the set of distinct, non-stopword tokens
+   * @throws IOException
+   */
+ private Set<String> getTokens(Article article) throws IOException {
+ Set<String> tokenList = new HashSet<String>();
+ WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()));
+ TermAttribute term = tok.addAttribute(TermAttribute.class);
+ StopFilter filter = new StopFilter(false, tok, stopwords, true);
+ try {
+ while (filter.incrementToken()) {
+ String token = term.term();
+ if (!StringUtils.isEmpty(token))
+ tokenList.add(token);
+ }
+ } catch (IOException e) {
+ log.error("Error tokenizing text", e);
+ } finally {
+ try {
+ tok.end();
+ } catch (IOException e) {
+ log.error("Error calling end()", e);
+ } finally {
+ try {
+ tok.close();
+ } catch (IOException e) {
+ log.error("Error closing tokenizer", e);
+ }
+ }
+ }
+ return tokenList;
+ }
+
+}
Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
------------------------------------------------------------------------------
svn:eol-style = native
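
Given the key layouts documented in the mapper above, a sketch of reading
the in-partition index back, assuming a base table named "wikipedia" and an
authorization that satisfies the "all|<language>" column visibility:

    import java.util.Map.Entry;

    import org.apache.accumulo.core.client.Connector;
    import org.apache.accumulo.core.client.Scanner;
    import org.apache.accumulo.core.data.Key;
    import org.apache.accumulo.core.data.Value;
    import org.apache.accumulo.core.security.Authorizations;
    import org.apache.hadoop.io.Text;

    public class IndexScanSketch {
      static void dumpTokenIndex(Connector connector) throws Exception {
        Scanner scanner = connector.createScanner("wikipedia", new Authorizations("all"));
        // In-partition index entries: colf is "fi\0TEXT", colq is fieldValue\0language\0article id
        scanner.fetchColumnFamily(new Text("fi\u0000TEXT"));
        for (Entry<Key,Value> entry : scanner)
          System.out.println(entry.getKey());
      }
    }
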
Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TotalAggregatingIterator.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TotalAggregatingIterator.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TotalAggregatingIterator.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TotalAggregatingIterator.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.iterator;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.accumulo.core.data.ByteSequence;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.PartialKey;
+import org.apache.accumulo.core.data.Range;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.iterators.IteratorEnvironment;
+import org.apache.accumulo.core.iterators.OptionDescriber;
+import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
+import org.apache.accumulo.core.iterators.aggregation.Aggregator;
+import org.apache.accumulo.core.iterators.conf.ColumnToClassMapping;
+import org.apache.accumulo.start.classloader.AccumuloClassLoader;
+
+/**
+ * Aggregate all values with the same key (row, colfam, colqual, colvis).
+ *
+ */
+
+public class TotalAggregatingIterator implements SortedKeyValueIterator<Key,Value>, OptionDescriber {
+
+ private SortedKeyValueIterator<Key,Value> iterator;
+
+ private Key workKey = new Key();
+
+ private Key aggrKey;
+ private Value aggrValue;
+
+ private Aggregator agg;
+
+ public TotalAggregatingIterator deepCopy(IteratorEnvironment env) {
+ return new TotalAggregatingIterator(this, env);
+ }
+
+ private TotalAggregatingIterator(TotalAggregatingIterator other, IteratorEnvironment env) {
+ iterator = other.iterator.deepCopy(env);
+ agg = other.agg;
+ }
+
+ public TotalAggregatingIterator() {}
+
+ private void aggregateRowColumn(Aggregator aggr) throws IOException {
+    // this method assumes that the first value is not a delete
+
+ workKey.set(iterator.getTopKey());
+
+ Key keyToAggregate = workKey;
+
+ aggr.reset();
+
+ aggr.collect(iterator.getTopValue());
+ iterator.next();
+
+ while (iterator.hasTop() && iterator.getTopKey().equals(keyToAggregate, PartialKey.ROW_COLFAM_COLQUAL_COLVIS)) {
+ aggr.collect(iterator.getTopValue());
+ iterator.next();
+ }
+
+ aggrKey = workKey;
+ aggrValue = aggr.aggregate();
+
+ }
+
+ private void findTop() throws IOException {
+ // check if aggregation is needed
+ if (iterator.hasTop()) {
+ aggregateRowColumn(agg);
+ }
+ }
+
+ public TotalAggregatingIterator(SortedKeyValueIterator<Key,Value> iterator, ColumnToClassMapping<Aggregator> aggregators) throws IOException {
+ this.iterator = iterator;
+ }
+
+ @Override
+ public Key getTopKey() {
+ if (aggrKey != null) {
+ return aggrKey;
+ }
+ return iterator.getTopKey();
+ }
+
+ @Override
+ public Value getTopValue() {
+ if (aggrKey != null) {
+ return aggrValue;
+ }
+ return iterator.getTopValue();
+ }
+
+ @Override
+ public boolean hasTop() {
+ return aggrKey != null || iterator.hasTop();
+ }
+
+ @Override
+ public void next() throws IOException {
+ if (aggrKey != null) {
+ aggrKey = null;
+ aggrValue = null;
+ } else {
+ iterator.next();
+ }
+
+ findTop();
+ }
+
+ @Override
+ public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException {
+ // do not want to seek to the middle of a value that should be
+ // aggregated...
+
+ Range seekRange = maximizeStartKeyTimeStamp(range);
+
+ iterator.seek(seekRange, columnFamilies, inclusive);
+ findTop();
+
+ if (range.getStartKey() != null) {
+ while (hasTop() && getTopKey().equals(range.getStartKey(), PartialKey.ROW_COLFAM_COLQUAL_COLVIS)
+ && getTopKey().getTimestamp() > range.getStartKey().getTimestamp()) {
+ // the value has a more recent time stamp, so
+ // pass it up
+ // log.debug("skipping "+getTopKey());
+ next();
+ }
+
+ while (hasTop() && range.beforeStartKey(getTopKey())) {
+ next();
+ }
+ }
+
+ }
+
+ @Override
+ public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
+ agg = createAggregator(options);
+ this.iterator = source;
+ }
+
+ @Override
+ public IteratorOptions describeOptions() {
+ return new IteratorOptions("agg", "Aggregators apply aggregating functions to values with identical keys", null,
+ Collections.singletonList("* <aggregatorClass>"));
+ }
+
+ @Override
+ public boolean validateOptions(Map<String,String> options) {
+ if (options.size() > 1)
+ throw new IllegalArgumentException("This iterator only accepts one configuration option, the name of the aggregating class");
+ agg = createAggregator(options);
+ return true;
+ }
+
+ private Aggregator createAggregator(Map<String,String> options) {
+ Aggregator a = null;
+ for (Entry<String,String> entry : options.entrySet()) {
+ try {
+ Class<? extends Aggregator> clazz = AccumuloClassLoader.loadClass(entry.getValue(), Aggregator.class);
+ a = clazz.newInstance();
+ } catch (ClassNotFoundException e) {
+ throw new IllegalArgumentException("class not found: " + entry.getValue());
+ } catch (InstantiationException e) {
+ throw new IllegalArgumentException("instantiation exception: " + entry.getValue());
+ } catch (IllegalAccessException e) {
+ throw new IllegalArgumentException("illegal access exception: " + entry.getValue());
+ }
+ }
+ return a;
+ }
+
+ static Range maximizeStartKeyTimeStamp(Range range) {
+ Range seekRange = range;
+
+ if (range.getStartKey() != null && range.getStartKey().getTimestamp() != Long.MAX_VALUE) {
+ Key seekKey = new Key(seekRange.getStartKey());
+ seekKey.setTimestamp(Long.MAX_VALUE);
+ seekRange = new Range(seekKey, true, range.getEndKey(), range.isEndKeyInclusive());
+ }
+
+ return seekRange;
+ }
+}
Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TotalAggregatingIterator.java
------------------------------------------------------------------------------
svn:eol-style = native
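
The seek handling above depends on maximizeStartKeyTimeStamp() widening the
requested range so the iterator never starts partway through a column's
versions. A small sketch of that behavior (placed in the same package,
since the helper is package-private):

    package org.apache.accumulo.examples.wikisearch.iterator;

    import org.apache.accumulo.core.data.Key;
    import org.apache.accumulo.core.data.Range;
    import org.apache.hadoop.io.Text;

    public class SeekRangeSketch {
      public static void main(String[] args) {
        Key start = new Key(new Text("row"), new Text("colf"), new Text("colq"));
        start.setTimestamp(42L);
        Range range = new Range(start, true, null, false);
        // The start key's timestamp is raised to Long.MAX_VALUE so the seek lands
        // before every version of the first key and the whole column is re-aggregated.
        Range widened = TotalAggregatingIterator.maximizeStartKeyTimeStamp(range);
        System.out.println(widened.getStartKey().getTimestamp()); // 9223372036854775807
      }
    }
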
Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.normalizer;
+
+import java.text.Normalizer;
+import java.text.Normalizer.Form;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A {@link Normalizer} which performs the following steps:
+ * <ol>
+ * <li>Unicode canonical decomposition ({@link Form#NFD})</li>
+ * <li>Removal of diacritical marks</li>
+ * <li>Unicode canonical composition ({@link Form#NFC})</li>
+ * <li>Lower casing in the {@link Locale#ENGLISH English locale}</li>
+ * </ol>
+ */
+public class LcNoDiacriticsNormalizer implements org.apache.accumulo.examples.wikisearch.normalizer.Normalizer {
+ private static final Pattern diacriticals = Pattern.compile("\\p{InCombiningDiacriticalMarks}");
+
+ public String normalizeFieldValue(String fieldName, Object fieldValue) {
+ String decomposed = Normalizer.normalize(fieldValue.toString(), Form.NFD);
+ String noDiacriticals = removeDiacriticalMarks(decomposed);
+ String recomposed = Normalizer.normalize(noDiacriticals, Form.NFC);
+ return recomposed.toLowerCase(Locale.ENGLISH);
+ }
+
+ private String removeDiacriticalMarks(String str) {
+ Matcher matcher = diacriticals.matcher(str);
+ return matcher.replaceAll("");
+ }
+
+}
Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java
------------------------------------------------------------------------------
svn:eol-style = native
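
A quick check of the pipeline above (decompose, strip combining marks,
recompose, lower-case); the field name is ignored by this normalizer:

    import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;

    public class NormalizerDemo {
      public static void main(String[] args) {
        LcNoDiacriticsNormalizer n = new LcNoDiacriticsNormalizer();
        System.out.println(n.normalizeFieldValue("TITLE", "Café Zürich")); // prints "cafe zurich"
      }
    }
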
Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.normalizer;
+
+public class NoOpNormalizer implements Normalizer {
+ public String normalizeFieldValue(String field, Object value) {
+ return value.toString();
+ }
+}
Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.normalizer;
+
+public interface Normalizer {
+
+ /**
+ * Creates normalized content for ingest based upon implemented logic.
+ *
+ * @param field
+ * The field being normalized
+ * @param value
+ * The value to normalize
+ * @return a normalized value
+ */
+ public String normalizeFieldValue(String field, Object value);
+
+}
Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.normalizer;
+
+import org.apache.commons.lang.math.NumberUtils;
+import org.apache.lucene.util.NumericUtils;
+
+public class NumberNormalizer implements Normalizer {
+
+ public String normalizeFieldValue(String field, Object value) {
+ if (NumberUtils.isNumber(value.toString())) {
+ Number n = NumberUtils.createNumber(value.toString());
+ if (n instanceof Integer)
+ return NumericUtils.intToPrefixCoded((Integer) n);
+ else if (n instanceof Long)
+ return NumericUtils.longToPrefixCoded((Long) n);
+ else if (n instanceof Float)
+ return NumericUtils.floatToPrefixCoded((Float) n);
+ else if (n instanceof Double)
+ return NumericUtils.doubleToPrefixCoded((Double) n);
+ else
+ throw new IllegalArgumentException("Unhandled numeric type: " + n.getClass());
+ } else {
+ throw new IllegalArgumentException("Value is not a number: " + value);
+ }
+ }
+
+}
Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java
------------------------------------------------------------------------------
svn:eol-style = native
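
The point of the prefix coding above is that the encoded strings sort in
numeric order even under plain lexicographic comparison, which is how
Accumulo orders keys. A small demonstration:

    import org.apache.accumulo.examples.wikisearch.normalizer.NumberNormalizer;

    public class NumberNormalizerDemo {
      public static void main(String[] args) {
        NumberNormalizer n = new NumberNormalizer();
        String nine = n.normalizeFieldValue("ID", "9");
        String ten = n.normalizeFieldValue("ID", "10");
        // "9" would sort after "10" as plain text; prefix-coded, numeric order is preserved
        System.out.println(nine.compareTo(ten) < 0); // true
      }
    }
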
Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/TermWeight.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/TermWeight.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/TermWeight.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/TermWeight.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,424 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Generated by the protocol buffer compiler. DO NOT EDIT!
+// source: TermWeight.proto
+
+package org.apache.accumulo.examples.wikisearch.protobuf;
+
+public final class TermWeight {
+ private TermWeight() {}
+
+ public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {}
+
+ public static final class Info extends com.google.protobuf.GeneratedMessage {
+ // Use Info.newBuilder() to construct.
+ private Info() {
+ initFields();
+ }
+
+ private Info(boolean noInit) {}
+
+ private static final Info defaultInstance;
+
+ public static Info getDefaultInstance() {
+ return defaultInstance;
+ }
+
+ public Info getDefaultInstanceForType() {
+ return defaultInstance;
+ }
+
+ public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
+ return org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.internal_static_protobuf_Info_descriptor;
+ }
+
+ protected com.google.protobuf.GeneratedMessage.FieldAccessorTable internalGetFieldAccessorTable() {
+ return org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.internal_static_protobuf_Info_fieldAccessorTable;
+ }
+
+ // required float normalizedTermFrequency = 1;
+ public static final int NORMALIZEDTERMFREQUENCY_FIELD_NUMBER = 1;
+ private boolean hasNormalizedTermFrequency;
+ private float normalizedTermFrequency_ = 0F;
+
+ public boolean hasNormalizedTermFrequency() {
+ return hasNormalizedTermFrequency;
+ }
+
+ public float getNormalizedTermFrequency() {
+ return normalizedTermFrequency_;
+ }
+
+ // repeated uint32 wordOffset = 2;
+ public static final int WORDOFFSET_FIELD_NUMBER = 2;
+ private java.util.List<java.lang.Integer> wordOffset_ = java.util.Collections.emptyList();
+
+ public java.util.List<java.lang.Integer> getWordOffsetList() {
+ return wordOffset_;
+ }
+
+ public int getWordOffsetCount() {
+ return wordOffset_.size();
+ }
+
+ public int getWordOffset(int index) {
+ return wordOffset_.get(index);
+ }
+
+ private void initFields() {}
+
+ public final boolean isInitialized() {
+ if (!hasNormalizedTermFrequency)
+ return false;
+ return true;
+ }
+
+ public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException {
+ getSerializedSize();
+ if (hasNormalizedTermFrequency()) {
+ output.writeFloat(1, getNormalizedTermFrequency());
+ }
+ for (int element : getWordOffsetList()) {
+ output.writeUInt32(2, element);
+ }
+ getUnknownFields().writeTo(output);
+ }
+
+ private int memoizedSerializedSize = -1;
+
+ public int getSerializedSize() {
+ int size = memoizedSerializedSize;
+ if (size != -1)
+ return size;
+
+ size = 0;
+ if (hasNormalizedTermFrequency()) {
+ size += com.google.protobuf.CodedOutputStream.computeFloatSize(1, getNormalizedTermFrequency());
+ }
+ {
+ int dataSize = 0;
+ for (int element : getWordOffsetList()) {
+ dataSize += com.google.protobuf.CodedOutputStream.computeUInt32SizeNoTag(element);
+ }
+ size += dataSize;
+ size += 1 * getWordOffsetList().size();
+ }
+ size += getUnknownFields().getSerializedSize();
+ memoizedSerializedSize = size;
+ return size;
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data).buildParsed();
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.ByteString data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data).buildParsed();
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(java.io.InputStream input) throws java.io.IOException {
+ return newBuilder().mergeFrom(input).buildParsed();
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseDelimitedFrom(java.io.InputStream input) throws java.io.IOException {
+ Builder builder = newBuilder();
+ if (builder.mergeDelimitedFrom(input)) {
+ return builder.buildParsed();
+ } else {
+ return null;
+ }
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseDelimitedFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ Builder builder = newBuilder();
+ if (builder.mergeDelimitedFrom(input, extensionRegistry)) {
+ return builder.buildParsed();
+ } else {
+ return null;
+ }
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.CodedInputStream input) throws java.io.IOException {
+ return newBuilder().mergeFrom(input).buildParsed();
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+ }
+
+ public static Builder newBuilder() {
+ return Builder.create();
+ }
+
+ public Builder newBuilderForType() {
+ return newBuilder();
+ }
+
+ public static Builder newBuilder(org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info prototype) {
+ return newBuilder().mergeFrom(prototype);
+ }
+
+ public Builder toBuilder() {
+ return newBuilder(this);
+ }
+
+ public static final class Builder extends com.google.protobuf.GeneratedMessage.Builder<Builder> {
+ private org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info result;
+
+ // Construct using protobuf.TermWeight.Info.newBuilder()
+ private Builder() {}
+
+ private static Builder create() {
+ Builder builder = new Builder();
+ builder.result = new org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info();
+ return builder;
+ }
+
+ protected org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info internalGetResult() {
+ return result;
+ }
+
+ public Builder clear() {
+ if (result == null) {
+ throw new IllegalStateException("Cannot call clear() after build().");
+ }
+ result = new org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info();
+ return this;
+ }
+
+ public Builder clone() {
+ return create().mergeFrom(result);
+ }
+
+ public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() {
+ return org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.getDescriptor();
+ }
+
+ public org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info getDefaultInstanceForType() {
+ return org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.getDefaultInstance();
+ }
+
+ public boolean isInitialized() {
+ return result.isInitialized();
+ }
+
+ public org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info build() {
+ if (result != null && !isInitialized()) {
+ throw newUninitializedMessageException(result);
+ }
+ return buildPartial();
+ }
+
+ private org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info buildParsed() throws com.google.protobuf.InvalidProtocolBufferException {
+ if (!isInitialized()) {
+ throw newUninitializedMessageException(result).asInvalidProtocolBufferException();
+ }
+ return buildPartial();
+ }
+
+ public org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info buildPartial() {
+ if (result == null) {
+ throw new IllegalStateException("build() has already been called on this Builder.");
+ }
+ if (result.wordOffset_ != java.util.Collections.EMPTY_LIST) {
+ result.wordOffset_ = java.util.Collections.unmodifiableList(result.wordOffset_);
+ }
+ org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info returnMe = result;
+ result = null;
+ return returnMe;
+ }
+
+ public Builder mergeFrom(com.google.protobuf.Message other) {
+ if (other instanceof org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info) {
+ return mergeFrom((org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info) other);
+ } else {
+ super.mergeFrom(other);
+ return this;
+ }
+ }
+
+ public Builder mergeFrom(org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info other) {
+ if (other == org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.getDefaultInstance())
+ return this;
+ if (other.hasNormalizedTermFrequency()) {
+ setNormalizedTermFrequency(other.getNormalizedTermFrequency());
+ }
+ if (!other.wordOffset_.isEmpty()) {
+ if (result.wordOffset_.isEmpty()) {
+ result.wordOffset_ = new java.util.ArrayList<java.lang.Integer>();
+ }
+ result.wordOffset_.addAll(other.wordOffset_);
+ }
+ this.mergeUnknownFields(other.getUnknownFields());
+ return this;
+ }
+
+ public Builder mergeFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ com.google.protobuf.UnknownFieldSet.Builder unknownFields = com.google.protobuf.UnknownFieldSet.newBuilder(this.getUnknownFields());
+ while (true) {
+ int tag = input.readTag();
+ switch (tag) {
+ case 0:
+ this.setUnknownFields(unknownFields.build());
+ return this;
+ default: {
+ if (!parseUnknownField(input, unknownFields, extensionRegistry, tag)) {
+ this.setUnknownFields(unknownFields.build());
+ return this;
+ }
+ break;
+ }
+ case 13: {
+ setNormalizedTermFrequency(input.readFloat());
+ break;
+ }
+ case 16: {
+ addWordOffset(input.readUInt32());
+ break;
+ }
+ case 18: {
+ int length = input.readRawVarint32();
+ int limit = input.pushLimit(length);
+ while (input.getBytesUntilLimit() > 0) {
+ addWordOffset(input.readUInt32());
+ }
+ input.popLimit(limit);
+ break;
+ }
+ }
+ }
+ }
+
+ // required float normalizedTermFrequency = 1;
+ public boolean hasNormalizedTermFrequency() {
+ return result.hasNormalizedTermFrequency();
+ }
+
+ public float getNormalizedTermFrequency() {
+ return result.getNormalizedTermFrequency();
+ }
+
+ public Builder setNormalizedTermFrequency(float value) {
+ result.hasNormalizedTermFrequency = true;
+ result.normalizedTermFrequency_ = value;
+ return this;
+ }
+
+ public Builder clearNormalizedTermFrequency() {
+ result.hasNormalizedTermFrequency = false;
+ result.normalizedTermFrequency_ = 0F;
+ return this;
+ }
+
+ // repeated uint32 wordOffset = 2;
+ public java.util.List<java.lang.Integer> getWordOffsetList() {
+ return java.util.Collections.unmodifiableList(result.wordOffset_);
+ }
+
+ public int getWordOffsetCount() {
+ return result.getWordOffsetCount();
+ }
+
+ public int getWordOffset(int index) {
+ return result.getWordOffset(index);
+ }
+
+ public Builder setWordOffset(int index, int value) {
+ result.wordOffset_.set(index, value);
+ return this;
+ }
+
+ public Builder addWordOffset(int value) {
+ if (result.wordOffset_.isEmpty()) {
+ result.wordOffset_ = new java.util.ArrayList<java.lang.Integer>();
+ }
+ result.wordOffset_.add(value);
+ return this;
+ }
+
+ public Builder addAllWordOffset(java.lang.Iterable<? extends java.lang.Integer> values) {
+ if (result.wordOffset_.isEmpty()) {
+ result.wordOffset_ = new java.util.ArrayList<java.lang.Integer>();
+ }
+ super.addAll(values, result.wordOffset_);
+ return this;
+ }
+
+ public Builder clearWordOffset() {
+ result.wordOffset_ = java.util.Collections.emptyList();
+ return this;
+ }
+
+ // @@protoc_insertion_point(builder_scope:protobuf.Info)
+ }
+
+ static {
+ defaultInstance = new Info(true);
+ org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.internalForceInit();
+ defaultInstance.initFields();
+ }
+
+ // @@protoc_insertion_point(class_scope:protobuf.Info)
+ }
+
+ private static com.google.protobuf.Descriptors.Descriptor internal_static_protobuf_Info_descriptor;
+ private static com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_protobuf_Info_fieldAccessorTable;
+
+ public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
+ return descriptor;
+ }
+
+ private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
+ static {
+ java.lang.String[] descriptorData = {"\n\020TermWeight.proto\022\010protobuf\";\n\004Info\022\037\n\027"
+ + "normalizedTermFrequency\030\001 \002(\002\022\022\n\nwordOff" + "set\030\002 \003(\rB\014\n\010protobufH\001"};
+ com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() {
+ public com.google.protobuf.ExtensionRegistry assignDescriptors(com.google.protobuf.Descriptors.FileDescriptor root) {
+ descriptor = root;
+ internal_static_protobuf_Info_descriptor = getDescriptor().getMessageTypes().get(0);
+ internal_static_protobuf_Info_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable(
+ internal_static_protobuf_Info_descriptor, new java.lang.String[] {"NormalizedTermFrequency", "WordOffset",}, org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.class,
+ org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.Builder.class);
+ return null;
+ }
+ };
+ com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[] {},
+ assigner);
+ }
+
+ public static void internalForceInit() {}
+
+ // @@protoc_insertion_point(outer_class_scope)
+}
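For reference, a minimal sketch of driving the generated TermWeight.Info builder and parser shown above (the field values are illustrative, and the snippet assumes the standard protobuf 2.x generated API plus an import of org.apache.accumulo.examples.wikisearch.protobuf.TermWeight):

    TermWeight.Info info = TermWeight.Info.newBuilder()
        .setNormalizedTermFrequency(0.75f)  // required float, field 1
        .addWordOffset(11)                  // repeated uint32, field 2
        .addWordOffset(42)
        .build();                           // throws if a required field is unset
    byte[] bytes = info.toByteArray();      // wire format, e.g. for an Accumulo Value
    TermWeight.Info parsed = TermWeight.Info.parseFrom(bytes);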
Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/TermWeight.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/Uid.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/Uid.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/Uid.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/Uid.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,470 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Generated by the protocol buffer compiler. DO NOT EDIT!
+// source: Uid.proto
+
+package org.apache.accumulo.examples.wikisearch.protobuf;
+
+public final class Uid {
+ private Uid() {}
+
+ public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {}
+
+ public static final class List extends com.google.protobuf.GeneratedMessage {
+ // Use List.newBuilder() to construct.
+ private List() {
+ initFields();
+ }
+
+ private List(boolean noInit) {}
+
+ private static final List defaultInstance;
+
+ public static List getDefaultInstance() {
+ return defaultInstance;
+ }
+
+ public List getDefaultInstanceForType() {
+ return defaultInstance;
+ }
+
+ public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
+ return org.apache.accumulo.examples.wikisearch.protobuf.Uid.internal_static_protobuf_List_descriptor;
+ }
+
+ protected com.google.protobuf.GeneratedMessage.FieldAccessorTable internalGetFieldAccessorTable() {
+ return org.apache.accumulo.examples.wikisearch.protobuf.Uid.internal_static_protobuf_List_fieldAccessorTable;
+ }
+
+ // required bool IGNORE = 1;
+ public static final int IGNORE_FIELD_NUMBER = 1;
+ private boolean hasIGNORE;
+ private boolean iGNORE_ = false;
+
+ public boolean hasIGNORE() {
+ return hasIGNORE;
+ }
+
+ public boolean getIGNORE() {
+ return iGNORE_;
+ }
+
+ // required uint64 COUNT = 2;
+ public static final int COUNT_FIELD_NUMBER = 2;
+ private boolean hasCOUNT;
+ private long cOUNT_ = 0L;
+
+ public boolean hasCOUNT() {
+ return hasCOUNT;
+ }
+
+ public long getCOUNT() {
+ return cOUNT_;
+ }
+
+ // repeated string UID = 3;
+ public static final int UID_FIELD_NUMBER = 3;
+ private java.util.List<java.lang.String> uID_ = java.util.Collections.emptyList();
+
+ public java.util.List<java.lang.String> getUIDList() {
+ return uID_;
+ }
+
+ public int getUIDCount() {
+ return uID_.size();
+ }
+
+ public java.lang.String getUID(int index) {
+ return uID_.get(index);
+ }
+
+ private void initFields() {}
+
+ public final boolean isInitialized() {
+ if (!hasIGNORE)
+ return false;
+ if (!hasCOUNT)
+ return false;
+ return true;
+ }
+
+ public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException {
+ getSerializedSize();
+ if (hasIGNORE()) {
+ output.writeBool(1, getIGNORE());
+ }
+ if (hasCOUNT()) {
+ output.writeUInt64(2, getCOUNT());
+ }
+ for (java.lang.String element : getUIDList()) {
+ output.writeString(3, element);
+ }
+ getUnknownFields().writeTo(output);
+ }
+
+ private int memoizedSerializedSize = -1;
+
+ public int getSerializedSize() {
+ int size = memoizedSerializedSize;
+ if (size != -1)
+ return size;
+
+ size = 0;
+ if (hasIGNORE()) {
+ size += com.google.protobuf.CodedOutputStream.computeBoolSize(1, getIGNORE());
+ }
+ if (hasCOUNT()) {
+ size += com.google.protobuf.CodedOutputStream.computeUInt64Size(2, getCOUNT());
+ }
+ {
+ int dataSize = 0;
+ for (java.lang.String element : getUIDList()) {
+ dataSize += com.google.protobuf.CodedOutputStream.computeStringSizeNoTag(element);
+ }
+ size += dataSize;
+ size += 1 * getUIDList().size();
+ }
+ size += getUnknownFields().getSerializedSize();
+ memoizedSerializedSize = size;
+ return size;
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data).buildParsed();
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.ByteString data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data).buildParsed();
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(java.io.InputStream input) throws java.io.IOException {
+ return newBuilder().mergeFrom(input).buildParsed();
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseDelimitedFrom(java.io.InputStream input) throws java.io.IOException {
+ Builder builder = newBuilder();
+ if (builder.mergeDelimitedFrom(input)) {
+ return builder.buildParsed();
+ } else {
+ return null;
+ }
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseDelimitedFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ Builder builder = newBuilder();
+ if (builder.mergeDelimitedFrom(input, extensionRegistry)) {
+ return builder.buildParsed();
+ } else {
+ return null;
+ }
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.CodedInputStream input) throws java.io.IOException {
+ return newBuilder().mergeFrom(input).buildParsed();
+ }
+
+ public static org.apache.accumulo.examples.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+ }
+
+ public static Builder newBuilder() {
+ return Builder.create();
+ }
+
+ public Builder newBuilderForType() {
+ return newBuilder();
+ }
+
+ public static Builder newBuilder(org.apache.accumulo.examples.wikisearch.protobuf.Uid.List prototype) {
+ return newBuilder().mergeFrom(prototype);
+ }
+
+ public Builder toBuilder() {
+ return newBuilder(this);
+ }
+
+ public static final class Builder extends com.google.protobuf.GeneratedMessage.Builder<Builder> {
+ private org.apache.accumulo.examples.wikisearch.protobuf.Uid.List result;
+
+ // Construct using protobuf.Uid.List.newBuilder()
+ private Builder() {}
+
+ private static Builder create() {
+ Builder builder = new Builder();
+ builder.result = new org.apache.accumulo.examples.wikisearch.protobuf.Uid.List();
+ return builder;
+ }
+
+ protected org.apache.accumulo.examples.wikisearch.protobuf.Uid.List internalGetResult() {
+ return result;
+ }
+
+ public Builder clear() {
+ if (result == null) {
+ throw new IllegalStateException("Cannot call clear() after build().");
+ }
+ result = new org.apache.accumulo.examples.wikisearch.protobuf.Uid.List();
+ return this;
+ }
+
+ public Builder clone() {
+ return create().mergeFrom(result);
+ }
+
+ public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() {
+ return org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.getDescriptor();
+ }
+
+ public org.apache.accumulo.examples.wikisearch.protobuf.Uid.List getDefaultInstanceForType() {
+ return org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.getDefaultInstance();
+ }
+
+ public boolean isInitialized() {
+ return result.isInitialized();
+ }
+
+ public org.apache.accumulo.examples.wikisearch.protobuf.Uid.List build() {
+ if (result != null && !isInitialized()) {
+ throw newUninitializedMessageException(result);
+ }
+ return buildPartial();
+ }
+
+ private org.apache.accumulo.examples.wikisearch.protobuf.Uid.List buildParsed() throws com.google.protobuf.InvalidProtocolBufferException {
+ if (!isInitialized()) {
+ throw newUninitializedMessageException(result).asInvalidProtocolBufferException();
+ }
+ return buildPartial();
+ }
+
+ public org.apache.accumulo.examples.wikisearch.protobuf.Uid.List buildPartial() {
+ if (result == null) {
+ throw new IllegalStateException("build() has already been called on this Builder.");
+ }
+ if (result.uID_ != java.util.Collections.EMPTY_LIST) {
+ result.uID_ = java.util.Collections.unmodifiableList(result.uID_);
+ }
+ org.apache.accumulo.examples.wikisearch.protobuf.Uid.List returnMe = result;
+ result = null;
+ return returnMe;
+ }
+
+ public Builder mergeFrom(com.google.protobuf.Message other) {
+ if (other instanceof org.apache.accumulo.examples.wikisearch.protobuf.Uid.List) {
+ return mergeFrom((org.apache.accumulo.examples.wikisearch.protobuf.Uid.List) other);
+ } else {
+ super.mergeFrom(other);
+ return this;
+ }
+ }
+
+ public Builder mergeFrom(org.apache.accumulo.examples.wikisearch.protobuf.Uid.List other) {
+ if (other == org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.getDefaultInstance())
+ return this;
+ if (other.hasIGNORE()) {
+ setIGNORE(other.getIGNORE());
+ }
+ if (other.hasCOUNT()) {
+ setCOUNT(other.getCOUNT());
+ }
+ if (!other.uID_.isEmpty()) {
+ if (result.uID_.isEmpty()) {
+ result.uID_ = new java.util.ArrayList<java.lang.String>();
+ }
+ result.uID_.addAll(other.uID_);
+ }
+ this.mergeUnknownFields(other.getUnknownFields());
+ return this;
+ }
+
+ public Builder mergeFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ com.google.protobuf.UnknownFieldSet.Builder unknownFields = com.google.protobuf.UnknownFieldSet.newBuilder(this.getUnknownFields());
+ while (true) {
+ int tag = input.readTag();
+ switch (tag) {
+ case 0:
+ this.setUnknownFields(unknownFields.build());
+ return this;
+ default: {
+ if (!parseUnknownField(input, unknownFields, extensionRegistry, tag)) {
+ this.setUnknownFields(unknownFields.build());
+ return this;
+ }
+ break;
+ }
+ case 8: {
+ setIGNORE(input.readBool());
+ break;
+ }
+ case 16: {
+ setCOUNT(input.readUInt64());
+ break;
+ }
+ case 26: {
+ addUID(input.readString());
+ break;
+ }
+ }
+ }
+ }
+
+ // required bool IGNORE = 1;
+ public boolean hasIGNORE() {
+ return result.hasIGNORE();
+ }
+
+ public boolean getIGNORE() {
+ return result.getIGNORE();
+ }
+
+ public Builder setIGNORE(boolean value) {
+ result.hasIGNORE = true;
+ result.iGNORE_ = value;
+ return this;
+ }
+
+ public Builder clearIGNORE() {
+ result.hasIGNORE = false;
+ result.iGNORE_ = false;
+ return this;
+ }
+
+ // required uint64 COUNT = 2;
+ public boolean hasCOUNT() {
+ return result.hasCOUNT();
+ }
+
+ public long getCOUNT() {
+ return result.getCOUNT();
+ }
+
+ public Builder setCOUNT(long value) {
+ result.hasCOUNT = true;
+ result.cOUNT_ = value;
+ return this;
+ }
+
+ public Builder clearCOUNT() {
+ result.hasCOUNT = false;
+ result.cOUNT_ = 0L;
+ return this;
+ }
+
+ // repeated string UID = 3;
+ public java.util.List<java.lang.String> getUIDList() {
+ return java.util.Collections.unmodifiableList(result.uID_);
+ }
+
+ public int getUIDCount() {
+ return result.getUIDCount();
+ }
+
+ public java.lang.String getUID(int index) {
+ return result.getUID(index);
+ }
+
+ public Builder setUID(int index, java.lang.String value) {
+ if (value == null) {
+ throw new NullPointerException();
+ }
+ result.uID_.set(index, value);
+ return this;
+ }
+
+ public Builder addUID(java.lang.String value) {
+ if (value == null) {
+ throw new NullPointerException();
+ }
+ if (result.uID_.isEmpty()) {
+ result.uID_ = new java.util.ArrayList<java.lang.String>();
+ }
+ result.uID_.add(value);
+ return this;
+ }
+
+ public Builder addAllUID(java.lang.Iterable<? extends java.lang.String> values) {
+ if (result.uID_.isEmpty()) {
+ result.uID_ = new java.util.ArrayList<java.lang.String>();
+ }
+ super.addAll(values, result.uID_);
+ return this;
+ }
+
+ public Builder clearUID() {
+ result.uID_ = java.util.Collections.emptyList();
+ return this;
+ }
+
+ // @@protoc_insertion_point(builder_scope:protobuf.List)
+ }
+
+ static {
+ defaultInstance = new List(true);
+ org.apache.accumulo.examples.wikisearch.protobuf.Uid.internalForceInit();
+ defaultInstance.initFields();
+ }
+
+ // @@protoc_insertion_point(class_scope:protobuf.List)
+ }
+
+ private static com.google.protobuf.Descriptors.Descriptor internal_static_protobuf_List_descriptor;
+ private static com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_protobuf_List_fieldAccessorTable;
+
+ public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
+ return descriptor;
+ }
+
+ private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
+ static {
+ java.lang.String[] descriptorData = {"\n\tUid.proto\022\010protobuf\"2\n\004List\022\016\n\006IGNORE\030"
+ + "\001 \002(\010\022\r\n\005COUNT\030\002 \002(\004\022\013\n\003UID\030\003 \003(\tB\014\n\010pro" + "tobufH\001"};
+ com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() {
+ public com.google.protobuf.ExtensionRegistry assignDescriptors(com.google.protobuf.Descriptors.FileDescriptor root) {
+ descriptor = root;
+ internal_static_protobuf_List_descriptor = getDescriptor().getMessageTypes().get(0);
+ internal_static_protobuf_List_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable(
+ internal_static_protobuf_List_descriptor, new java.lang.String[] {"IGNORE", "COUNT", "UID",}, org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.class,
+ org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.Builder.class);
+ return null;
+ }
+ };
+ com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[] {},
+ assigner);
+ }
+
+ public static void internalForceInit() {}
+
+ // @@protoc_insertion_point(outer_class_scope)
+}
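Similarly, a minimal sketch for the Uid.List message above (values illustrative; note that isInitialized() requires both IGNORE and COUNT to be set before build() will succeed):

    Uid.List list = Uid.List.newBuilder()
        .setIGNORE(false)        // required bool, field 1
        .setCOUNT(2L)            // required uint64, field 2
        .addUID("doc-00001")     // repeated string, field 3
        .addUID("doc-00002")
        .build();
    Uid.List roundTripped = Uid.List.parseFrom(list.toByteArray());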
Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/Uid.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReader.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReader.java?rev=1230608&view=auto
==============================================================================
--- incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReader.java (added)
+++ incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReader.java Thu Jan 12 16:06:14 2012
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.examples.wikisearch.reader;
+
+
+import java.io.IOException;
+
+import org.apache.accumulo.examples.wikisearch.ingest.WikipediaConfiguration;
+import org.apache.accumulo.examples.wikisearch.util.TextUtil;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+
+/**
+ * This class aggregates Text values delimited by a start token and an end token. A typical use case is XML data. It will not work with data that contains
+ * nested start and end tokens. (A brief configuration sketch follows this file listing.)
+ *
+ */
+public class AggregatingRecordReader extends LongLineRecordReader {
+
+ public static final String START_TOKEN = "aggregating.token.start";
+ public static final String END_TOKEN = "aggregating.token.end";
+ public static final String RETURN_PARTIAL_MATCHES = "aggregating.allow.partial";
+
+ private LongWritable key = new LongWritable();
+ private String startToken = null;
+ private String endToken = null;
+ private long counter = 0;
+ private Text aggValue = new Text();
+ private boolean startFound = false;
+ private StringBuilder remainder = new StringBuilder(0);
+ private boolean returnPartialMatches = false;
+
+ @Override
+ public LongWritable getCurrentKey() {
+ key.set(counter);
+ return key;
+ }
+
+ @Override
+ public Text getCurrentValue() {
+ return aggValue;
+ }
+
+ @Override
+ public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
+ super.initialize(genericSplit, context);
+ this.startToken = WikipediaConfiguration.isNull(context.getConfiguration(), START_TOKEN, String.class);
+ this.endToken = WikipediaConfiguration.isNull(context.getConfiguration(), END_TOKEN, String.class);
+ this.returnPartialMatches = context.getConfiguration().getBoolean(RETURN_PARTIAL_MATCHES, false);
+
+ /*
+ * Appending to a Text works much like the + operator on Strings: it creates a byte array exactly the size of [prefix + suffix] and copies the bytes
+ * into the new array. This reader does many small appends, one line at a time. With most XML, documents are partitioned on line boundaries, so we will
+ * generally perform many appends. Pre-allocating a large backing byte array for the Text object avoids the repeated reallocation and gives us
+ * StringBuilder-like behavior for Text objects.
+ */
+ byte[] txtBuffer = new byte[2048];
+ aggValue.set(txtBuffer);
+ }
+
+ @Override
+ public boolean nextKeyValue() throws IOException {
+ aggValue.clear();
+ boolean hasNext = false;
+ boolean finished = false;
+ // Pull input until a complete start/end-delimited record has been aggregated
+ while (!finished && ((hasNext = super.nextKeyValue()) || remainder.length() > 0)) {
+ if (hasNext)
+ finished = process(super.getCurrentValue());
+ else
+ finished = process(null);
+ if (finished) {
+ startFound = false;
+ counter++;
+ return true;
+ }
+ }
+ // If we have anything loaded in the agg value (and we found a start)
+ // then we ran out of data before finding the end. Just return the
+ // data we have and if it's not valid, downstream parsing of the data
+ // will fail.
+ if (returnPartialMatches && startFound && aggValue.getLength() > 0) {
+ startFound = false;
+ counter++;
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Populates aggValue with the contents of the Text object.
+ *
+ * @param t the next chunk of input, or null when the underlying reader is exhausted and any buffered remainder should be flushed
+ * @return true if aggValue holds a complete aggregated value, false if more data is needed.
+ */
+ private boolean process(Text t) {
+
+ if (null != t)
+ remainder.append(t.toString());
+ while (remainder.length() > 0) {
+ if (!startFound) {
+ // Look for the start token; if found, begin aggregating at its offset
+ int start = remainder.indexOf(startToken);
+ if (-1 != start) {
+ // Append the start token to the aggregate value
+ TextUtil.textAppendNoNull(aggValue, remainder.substring(start, start + startToken.length()), false);
+ // Remove everything through the end of the start token from the remainder
+ remainder.delete(0, start + startToken.length());
+ startFound = true;
+ } else {
+ // Start token not found in the buffered data, so none of it can be
+ // part of a match; discard it
+ remainder.delete(0, remainder.length());
+ }
+ } else {
+ // Try to find the end
+ int end = remainder.indexOf(endToken);
+ // Also try to find the start
+ int start = remainder.indexOf(startToken);
+ if (-1 == end) {
+ if (returnPartialMatches && start >= 0) {
+ // End token not found, but another start token was found...
+ // The amount to copy is up to the beginning of the next start token
+ TextUtil.textAppendNoNull(aggValue, remainder.substring(0, start), false);
+ remainder.delete(0, start);
+ return true;
+ } else {
+ // No end token yet; aggregate the entire remainder and wait for more data
+ TextUtil.textAppendNoNull(aggValue, remainder.toString(), false);
+ // Delete all chars from remainder
+ remainder.delete(0, remainder.length());
+ }
+ } else {
+ if (returnPartialMatches && start >= 0 && start < end) {
+ // We found the end token, but found another start token first, so
+ // deal with that.
+ TextUtil.textAppendNoNull(aggValue, remainder.substring(0, start), false);
+ remainder.delete(0, start);
+ return true;
+ } else {
+ // END_TOKEN was found. Extract to the end of END_TOKEN
+ TextUtil.textAppendNoNull(aggValue, remainder.substring(0, end + endToken.length()), false);
+ // Remove from remainder up to the end of END_TOKEN
+ remainder.delete(0, end + endToken.length());
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+ }
+
+}
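A minimal configuration sketch for this reader (the token values and the job variable are illustrative; a custom InputFormat would hand this reader back from createRecordReader()):

    Configuration conf = job.getConfiguration();
    conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
    conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
    conf.setBoolean(AggregatingRecordReader.RETURN_PARTIAL_MATCHES, false);

Each call to nextKeyValue() then yields one aggregated <page>...</page> block via getCurrentValue().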
Propchange: incubator/accumulo/trunk/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/reader/AggregatingRecordReader.java
------------------------------------------------------------------------------
svn:eol-style = native