Posted to commits@accumulo.apache.org by ec...@apache.org on 2013/11/26 16:50:02 UTC
[14/40] ACCUMULO-600 removed wikisearch from trunk
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitionedIngester.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitionedIngester.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitionedIngester.java
deleted file mode 100644
index 90b8308..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitionedIngester.java
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.EnumSet;
-import java.util.List;
-import java.util.Set;
-import java.util.SortedSet;
-import java.util.TreeSet;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.accumulo.core.client.AccumuloException;
-import org.apache.accumulo.core.client.AccumuloSecurityException;
-import org.apache.accumulo.core.client.Connector;
-import org.apache.accumulo.core.client.IteratorSetting;
-import org.apache.accumulo.core.client.IteratorSetting.Column;
-import org.apache.accumulo.core.client.TableExistsException;
-import org.apache.accumulo.core.client.TableNotFoundException;
-import org.apache.accumulo.core.client.admin.TableOperations;
-import org.apache.accumulo.core.client.mapreduce.AccumuloOutputFormat;
-import org.apache.accumulo.core.data.Mutation;
-import org.apache.accumulo.core.iterators.IteratorUtil.IteratorScope;
-import org.apache.accumulo.core.iterators.user.SummingCombiner;
-import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor.Article;
-import org.apache.accumulo.examples.wikisearch.iterator.GlobalIndexUidCombiner;
-import org.apache.accumulo.examples.wikisearch.iterator.TextIndexCombiner;
-import org.apache.accumulo.examples.wikisearch.output.SortingRFileOutputFormat;
-import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
-import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.hadoop.io.SequenceFile.CompressionType;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
-import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-import org.apache.log4j.Logger;
-
-public class WikipediaPartitionedIngester extends Configured implements Tool {
-
- private static final Logger log = Logger.getLogger(WikipediaPartitionedIngester.class);
-
- public final static String INGEST_LANGUAGE = "wikipedia.ingest_language";
- public final static String SPLIT_FILE = "wikipedia.split_file";
- public final static String TABLE_NAME = "wikipedia.table";
-
- public static void main(String[] args) throws Exception {
- int res = ToolRunner.run(new Configuration(), new WikipediaPartitionedIngester(), args);
- System.exit(res);
- }
-
- private void createTables(TableOperations tops, String tableName) throws AccumuloException, AccumuloSecurityException, TableNotFoundException,
- TableExistsException {
- // Create the shard table
- String indexTableName = tableName + "Index";
- String reverseIndexTableName = tableName + "ReverseIndex";
- String metadataTableName = tableName + "Metadata";
-
- // create the shard table
- if (!tops.exists(tableName)) {
- // Set a text index combiner on the given field names. No combiner is set if the option is not supplied
- String textIndexFamilies = WikipediaMapper.TOKENS_FIELD_NAME;
-
- tops.create(tableName);
- if (textIndexFamilies.length() > 0) {
- System.out.println("Adding content combiner on the fields: " + textIndexFamilies);
-
- IteratorSetting setting = new IteratorSetting(10, TextIndexCombiner.class);
- List<Column> columns = new ArrayList<Column>();
- for (String family : StringUtils.split(textIndexFamilies, ',')) {
- columns.add(new Column("fi\0" + family));
- }
- TextIndexCombiner.setColumns(setting, columns);
- TextIndexCombiner.setLossyness(setting, true);
-
- tops.attachIterator(tableName, setting, EnumSet.allOf(IteratorScope.class));
- }
-
- // Set the locality group for the full content column family
- tops.setLocalityGroups(tableName, Collections.singletonMap("WikipediaDocuments", Collections.singleton(new Text(WikipediaMapper.DOCUMENT_COLUMN_FAMILY))));
-
- }
-
- if (!tops.exists(indexTableName)) {
- tops.create(indexTableName);
- // Add the UID combiner
- IteratorSetting setting = new IteratorSetting(19, "UIDAggregator", GlobalIndexUidCombiner.class);
- GlobalIndexUidCombiner.setCombineAllColumns(setting, true);
- GlobalIndexUidCombiner.setLossyness(setting, true);
- tops.attachIterator(indexTableName, setting, EnumSet.allOf(IteratorScope.class));
- }
-
- if (!tops.exists(reverseIndexTableName)) {
- tops.create(reverseIndexTableName);
- // Add the UID combiner
- IteratorSetting setting = new IteratorSetting(19, "UIDAggregator", GlobalIndexUidCombiner.class);
- GlobalIndexUidCombiner.setCombineAllColumns(setting, true);
- GlobalIndexUidCombiner.setLossyness(setting, true);
- tops.attachIterator(reverseIndexTableName, setting, EnumSet.allOf(IteratorScope.class));
- }
-
- if (!tops.exists(metadataTableName)) {
- // Add the SummingCombiner with VARLEN encoding for the frequency column
- tops.create(metadataTableName);
- IteratorSetting setting = new IteratorSetting(10, SummingCombiner.class);
- SummingCombiner.setColumns(setting, Collections.singletonList(new Column("f")));
- SummingCombiner.setEncodingType(setting, SummingCombiner.Type.VARLEN);
- tops.attachIterator(metadataTableName, setting, EnumSet.allOf(IteratorScope.class));
- }
- }
-
- @Override
- public int run(String[] args) throws Exception {
- Configuration conf = getConf();
- if(WikipediaConfiguration.runPartitioner(conf))
- {
- int result = runPartitionerJob();
- if(result != 0)
- return result;
- }
- if(WikipediaConfiguration.runIngest(conf))
- {
- int result = runIngestJob();
- if(result != 0)
- return result;
- if(WikipediaConfiguration.bulkIngest(conf))
- return loadBulkFiles();
- }
- return 0;
- }
-
- private int runPartitionerJob() throws Exception
- {
- Job partitionerJob = new Job(getConf(), "Partition Wikipedia");
- Configuration partitionerConf = partitionerJob.getConfiguration();
- partitionerConf.set("mapred.map.tasks.speculative.execution", "false");
-
- configurePartitionerJob(partitionerJob);
-
- List<Path> inputPaths = new ArrayList<Path>();
- SortedSet<String> languages = new TreeSet<String>();
- FileSystem fs = FileSystem.get(partitionerConf);
- Path parent = new Path(partitionerConf.get("wikipedia.input"));
- listFiles(parent, fs, inputPaths, languages);
-
- System.out.println("Input files in " + parent + ":" + inputPaths.size());
- Path[] inputPathsArray = new Path[inputPaths.size()];
- inputPaths.toArray(inputPathsArray);
-
- System.out.println("Languages:" + languages.size());
-
- // setup input format
-
- WikipediaInputFormat.setInputPaths(partitionerJob, inputPathsArray);
-
- partitionerJob.setMapperClass(WikipediaPartitioner.class);
- partitionerJob.setNumReduceTasks(0);
-
- // setup output format
- partitionerJob.setMapOutputKeyClass(Text.class);
- partitionerJob.setMapOutputValueClass(Article.class);
- partitionerJob.setOutputKeyClass(Text.class);
- partitionerJob.setOutputValueClass(Article.class);
- partitionerJob.setOutputFormatClass(SequenceFileOutputFormat.class);
- Path outputDir = WikipediaConfiguration.getPartitionedArticlesPath(partitionerConf);
- SequenceFileOutputFormat.setOutputPath(partitionerJob, outputDir);
- SequenceFileOutputFormat.setCompressOutput(partitionerJob, true);
- SequenceFileOutputFormat.setOutputCompressionType(partitionerJob, CompressionType.RECORD);
-
- return partitionerJob.waitForCompletion(true) ? 0 : 1;
- }
-
- private int runIngestJob() throws Exception
- {
- Job ingestJob = new Job(getConf(), "Ingest Partitioned Wikipedia");
- Configuration ingestConf = ingestJob.getConfiguration();
- ingestConf.set("mapred.map.tasks.speculative.execution", "false");
-
- configureIngestJob(ingestJob);
-
- String tablename = WikipediaConfiguration.getTableName(ingestConf);
-
- Connector connector = WikipediaConfiguration.getConnector(ingestConf);
-
- TableOperations tops = connector.tableOperations();
-
- createTables(tops, tablename);
-
- ingestJob.setMapperClass(WikipediaPartitionedMapper.class);
- ingestJob.setNumReduceTasks(0);
-
- // setup input format
- ingestJob.setInputFormatClass(SequenceFileInputFormat.class);
- SequenceFileInputFormat.setInputPaths(ingestJob, WikipediaConfiguration.getPartitionedArticlesPath(ingestConf));
- // TODO make split size configurable
- SequenceFileInputFormat.setMinInputSplitSize(ingestJob, WikipediaConfiguration.getMinInputSplitSize(ingestConf));
-
- // setup output format
- ingestJob.setMapOutputKeyClass(Text.class);
- ingestJob.setMapOutputValueClass(Mutation.class);
-
- if(WikipediaConfiguration.bulkIngest(ingestConf))
- {
- ingestJob.setOutputFormatClass(SortingRFileOutputFormat.class);
- SortingRFileOutputFormat.setMaxBufferSize(ingestConf, WikipediaConfiguration.bulkIngestBufferSize(ingestConf));
- String bulkIngestDir = WikipediaConfiguration.bulkIngestDir(ingestConf);
- if(bulkIngestDir == null)
- {
- log.error("Bulk ingest dir not set");
- return 1;
- }
- SortingRFileOutputFormat.setPathName(ingestConf, WikipediaConfiguration.bulkIngestDir(ingestConf));
- } else {
- ingestJob.setOutputFormatClass(AccumuloOutputFormat.class);
- String zookeepers = WikipediaConfiguration.getZookeepers(ingestConf);
- String instanceName = WikipediaConfiguration.getInstanceName(ingestConf);
- String user = WikipediaConfiguration.getUser(ingestConf);
- byte[] password = WikipediaConfiguration.getPassword(ingestConf);
- AccumuloOutputFormat.setOutputInfo(ingestJob.getConfiguration(), user, password, true, tablename);
- AccumuloOutputFormat.setZooKeeperInstance(ingestJob.getConfiguration(), instanceName, zookeepers);
- }
-
- return ingestJob.waitForCompletion(true) ? 0 : 1;
- }
-
- private int loadBulkFiles() throws IOException, AccumuloException, AccumuloSecurityException, TableNotFoundException
- {
- Configuration conf = getConf();
-
- Connector connector = WikipediaConfiguration.getConnector(conf);
-
- FileSystem fs = FileSystem.get(conf);
- String directory = WikipediaConfiguration.bulkIngestDir(conf);
-
- String failureDirectory = WikipediaConfiguration.bulkIngestFailureDir(conf);
-
- for(FileStatus status: fs.listStatus(new Path(directory)))
- {
- if(status.isDir() == false)
- continue;
- Path dir = status.getPath();
- Path failPath = new Path(failureDirectory+"/"+dir.getName());
- fs.mkdirs(failPath);
- connector.tableOperations().importDirectory(dir.getName(), dir.toString(), failPath.toString(), true);
- }
-
- return 0;
- }
-
- public final static PathFilter partFilter = new PathFilter() {
- @Override
- public boolean accept(Path path) {
- return path.getName().startsWith("part");
- };
- };
-
- protected void configurePartitionerJob(Job job) {
- Configuration conf = job.getConfiguration();
- job.setJarByClass(WikipediaPartitionedIngester.class);
- job.setInputFormatClass(WikipediaInputFormat.class);
- conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
- conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
- }
-
- protected void configureIngestJob(Job job) {
- job.setJarByClass(WikipediaPartitionedIngester.class);
- }
-
-  protected static final Pattern filePattern = Pattern.compile("([a-z_]+).*\\.xml(\\.bz2)?");
-
- protected void listFiles(Path path, FileSystem fs, List<Path> files, Set<String> languages) throws IOException {
- for (FileStatus status : fs.listStatus(path)) {
- if (status.isDir()) {
- listFiles(status.getPath(), fs, files, languages);
- } else {
- Path p = status.getPath();
- Matcher matcher = filePattern.matcher(p.getName());
- if (matcher.matches()) {
- languages.add(matcher.group(1));
- files.add(p);
- }
- }
- }
- }
-}
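
As context for the table setup this class performed, here is a minimal standalone sketch of attaching the frequency-summing combiner, using only the client calls visible in createTables() above. The connector is assumed to exist, and "wikiMetadata" is a placeholder for the configured <table>Metadata name:

    import java.util.Collections;
    import java.util.EnumSet;

    import org.apache.accumulo.core.client.Connector;
    import org.apache.accumulo.core.client.IteratorSetting;
    import org.apache.accumulo.core.client.IteratorSetting.Column;
    import org.apache.accumulo.core.client.admin.TableOperations;
    import org.apache.accumulo.core.iterators.IteratorUtil.IteratorScope;
    import org.apache.accumulo.core.iterators.user.SummingCombiner;

    public class MetadataTableSetupSketch {
      public static void setup(Connector connector) throws Exception {
        TableOperations tops = connector.tableOperations();
        if (!tops.exists("wikiMetadata")) {
          tops.create("wikiMetadata");
          // Sum the "f" (frequency) column with variable-length encoding,
          // applied at scan, minor, and major compaction scopes.
          IteratorSetting setting = new IteratorSetting(10, SummingCombiner.class);
          SummingCombiner.setColumns(setting, Collections.singletonList(new Column("f")));
          SummingCombiner.setEncodingType(setting, SummingCombiner.Type.VARLEN);
          tops.attachIterator("wikiMetadata", setting, EnumSet.allOf(IteratorScope.class));
        }
      }
    }
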
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitionedMapper.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitionedMapper.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitionedMapper.java
deleted file mode 100644
index bb4ae64..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitionedMapper.java
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- *
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-
-import java.io.IOException;
-import java.nio.charset.Charset;
-import java.util.HashSet;
-import java.util.Map.Entry;
-import java.util.Set;
-
-import org.apache.accumulo.core.client.AccumuloException;
-import org.apache.accumulo.core.client.AccumuloSecurityException;
-import org.apache.accumulo.core.client.MultiTableBatchWriter;
-import org.apache.accumulo.core.client.MutationsRejectedException;
-import org.apache.accumulo.core.data.Mutation;
-import org.apache.accumulo.core.data.Value;
-import org.apache.accumulo.core.security.ColumnVisibility;
-import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor.Article;
-import org.apache.accumulo.examples.wikisearch.iterator.GlobalIndexUidCombiner;
-import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;
-import org.apache.accumulo.examples.wikisearch.protobuf.Uid;
-import org.apache.commons.codec.binary.Base64;
-import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.Multimap;
-
-public class WikipediaPartitionedMapper extends Mapper<Text,Article,Text,Mutation> {
-
- // private static final Logger log = Logger.getLogger(WikipediaPartitionedMapper.class);
-
- public final static Charset UTF8 = Charset.forName("UTF-8");
- public static final String DOCUMENT_COLUMN_FAMILY = "d";
- public static final String METADATA_EVENT_COLUMN_FAMILY = "e";
- public static final String METADATA_INDEX_COLUMN_FAMILY = "i";
- public static final String TOKENS_FIELD_NAME = "TEXT";
-
- private static final Value NULL_VALUE = new Value(new byte[0]);
- private static final String cvPrefix = "all|";
-
- private int numPartitions = 0;
-
- private Text tablename = null;
- private Text indexTableName = null;
- private Text reverseIndexTableName = null;
- private Text metadataTableName = null;
-
- private static class MutationInfo {
- final String row;
- final String colfam;
- final String colqual;
- final ColumnVisibility cv;
- final long timestamp;
-
- public MutationInfo(String row, String colfam, String colqual, ColumnVisibility cv, long timestamp) {
- super();
- this.row = row;
- this.colfam = colfam;
- this.colqual = colqual;
- this.cv = cv;
- this.timestamp = timestamp;
- }
-
- @Override
- public boolean equals(Object obj) {
- MutationInfo other = (MutationInfo)obj;
- return (row == other.row || row.equals(other.row)) &&
- (colfam == other.colfam || colfam.equals(other.colfam)) &&
- colqual.equals(other.colqual) &&
- (cv == other.cv || cv.equals(other.cv)) &&
- timestamp == other.timestamp;
- }
-
- @Override
- public int hashCode() {
- return row.hashCode() ^ colfam.hashCode() ^ colqual.hashCode() ^ cv.hashCode() ^ (int)timestamp;
- }
- }
-
- private LRUOutputCombiner<MutationInfo,CountAndSet> wikiIndexOutput;
- private LRUOutputCombiner<MutationInfo,CountAndSet> wikiReverseIndexOutput;
- private LRUOutputCombiner<MutationInfo,Value> wikiMetadataOutput;
-
- private static class CountAndSet
- {
- public int count;
- public HashSet<String> set;
-
- public CountAndSet(String entry)
- {
- set = new HashSet<String>();
- set.add(entry);
- count = 1;
- }
- }
-
- MultiTableBatchWriter mtbw;
-
- @Override
- public void setup(final Context context) {
- Configuration conf = context.getConfiguration();
- tablename = new Text(WikipediaConfiguration.getTableName(conf));
- indexTableName = new Text(tablename + "Index");
- reverseIndexTableName = new Text(tablename + "ReverseIndex");
- metadataTableName = new Text(tablename + "Metadata");
-
- try {
- mtbw = WikipediaConfiguration.getConnector(conf).createMultiTableBatchWriter(10000000, 1000, 10);
- } catch (AccumuloException e) {
- throw new RuntimeException(e);
- } catch (AccumuloSecurityException e) {
- throw new RuntimeException(e);
- }
-
- final Text metadataTableNameFinal = metadataTableName;
- final Text indexTableNameFinal = indexTableName;
- final Text reverseIndexTableNameFinal = reverseIndexTableName;
-
- numPartitions = WikipediaConfiguration.getNumPartitions(conf);
-
- LRUOutputCombiner.Fold<CountAndSet> indexFold =
- new LRUOutputCombiner.Fold<CountAndSet>() {
- @Override
- public CountAndSet fold(CountAndSet oldValue, CountAndSet newValue) {
- oldValue.count += newValue.count;
- if(oldValue.set == null || newValue.set == null)
- {
- oldValue.set = null;
- return oldValue;
- }
- oldValue.set.addAll(newValue.set);
- if(oldValue.set.size() > GlobalIndexUidCombiner.MAX)
- oldValue.set = null;
- return oldValue;
- }
- };
- LRUOutputCombiner.Output<MutationInfo,CountAndSet> indexOutput =
- new LRUOutputCombiner.Output<WikipediaPartitionedMapper.MutationInfo,CountAndSet>() {
-
- @Override
- public void output(MutationInfo key, CountAndSet value)
- {
- Uid.List.Builder builder = Uid.List.newBuilder();
- builder.setCOUNT(value.count);
- if (value.set == null) {
- builder.setIGNORE(true);
- builder.clearUID();
- } else {
- builder.setIGNORE(false);
- builder.addAllUID(value.set);
- }
- Uid.List list = builder.build();
- Value val = new Value(list.toByteArray());
- Mutation m = new Mutation(key.row);
- m.put(key.colfam, key.colqual, key.cv, key.timestamp, val);
- try {
- mtbw.getBatchWriter(indexTableNameFinal.toString()).addMutation(m);
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- }
- };
- LRUOutputCombiner.Output<MutationInfo,CountAndSet> reverseIndexOutput =
- new LRUOutputCombiner.Output<WikipediaPartitionedMapper.MutationInfo,CountAndSet>() {
-
- @Override
- public void output(MutationInfo key, CountAndSet value)
- {
- Uid.List.Builder builder = Uid.List.newBuilder();
- builder.setCOUNT(value.count);
- if (value.set == null) {
- builder.setIGNORE(true);
- builder.clearUID();
- } else {
- builder.setIGNORE(false);
- builder.addAllUID(value.set);
- }
- Uid.List list = builder.build();
- Value val = new Value(list.toByteArray());
- Mutation m = new Mutation(key.row);
- m.put(key.colfam, key.colqual, key.cv, key.timestamp, val);
- try {
- mtbw.getBatchWriter(reverseIndexTableNameFinal.toString()).addMutation(m);
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- }
- };
-
- wikiIndexOutput = new LRUOutputCombiner<WikipediaPartitionedMapper.MutationInfo,CountAndSet>(10000,indexFold,indexOutput);
- wikiReverseIndexOutput = new LRUOutputCombiner<WikipediaPartitionedMapper.MutationInfo,CountAndSet>(10000, indexFold,reverseIndexOutput);
- wikiMetadataOutput = new LRUOutputCombiner<WikipediaPartitionedMapper.MutationInfo,Value>(10000,
- new LRUOutputCombiner.Fold<Value>() {
- @Override
- public Value fold(Value oldValue, Value newValue) {
- return oldValue;
- }},
- new LRUOutputCombiner.Output<MutationInfo,Value>() {
- @Override
- public void output(MutationInfo key, Value value) {
- Mutation m = new Mutation(key.row);
- m.put(key.colfam, key.colqual, key.cv, key.timestamp, value);
- try {
- mtbw.getBatchWriter(metadataTableNameFinal.toString()).addMutation(m);
- } catch (Exception e) {
- throw new RuntimeException(e);
- }
- }});
- }
-
-
-
- @Override
- protected void cleanup(Context context) throws IOException, InterruptedException {
- wikiIndexOutput.flush();
- wikiMetadataOutput.flush();
- wikiReverseIndexOutput.flush();
- try {
- mtbw.close();
- } catch (MutationsRejectedException e) {
- throw new RuntimeException(e);
- }
- }
-
-
-
- @Override
- protected void map(Text language, Article article, Context context) throws IOException, InterruptedException {
- String NULL_BYTE = "\u0000";
- String colfPrefix = language.toString() + NULL_BYTE;
- String indexPrefix = "fi" + NULL_BYTE;
- ColumnVisibility cv = new ColumnVisibility(cvPrefix + language);
-
- if (article != null) {
- Text partitionId = new Text(Integer.toString(WikipediaMapper.getPartitionId(article, numPartitions)));
-
- // Create the mutations for the document.
-      // Row is partition id, colf is language\0articleid, colq is fieldName\0fieldValue
- Mutation m = new Mutation(partitionId);
- for (Entry<String,Object> entry : article.getFieldValues().entrySet()) {
- m.put(colfPrefix + article.getId(), entry.getKey() + NULL_BYTE + entry.getValue().toString(), cv, article.getTimestamp(), NULL_VALUE);
- // Create mutations for the metadata table.
- MutationInfo mm = new MutationInfo(entry.getKey(), METADATA_EVENT_COLUMN_FAMILY, language.toString(), cv, article.getTimestamp());
- wikiMetadataOutput.put(mm, NULL_VALUE);
- }
-
- // Tokenize the content
- Set<String> tokens = WikipediaMapper.getTokens(article);
-
- // We are going to put the fields to be indexed into a multimap. This allows us to iterate
- // over the entire set once.
- Multimap<String,String> indexFields = HashMultimap.create();
- // Add the normalized field values
- LcNoDiacriticsNormalizer normalizer = new LcNoDiacriticsNormalizer();
- for (Entry<String,String> index : article.getNormalizedFieldValues().entrySet())
- indexFields.put(index.getKey(), index.getValue());
- // Add the tokens
- for (String token : tokens)
- indexFields.put(TOKENS_FIELD_NAME, normalizer.normalizeFieldValue("", token));
-
- for (Entry<String,String> index : indexFields.entries()) {
- // Create mutations for the in partition index
- // Row is partition id, colf is 'fi'\0fieldName, colq is fieldValue\0language\0article id
- m.put(indexPrefix + index.getKey(), index.getValue() + NULL_BYTE + colfPrefix + article.getId(), cv, article.getTimestamp(), NULL_VALUE);
-
- // Create mutations for the global index
- // Row is field value, colf is field name, colq is partitionid\0language, value is Uid.List object
- MutationInfo gm = new MutationInfo(index.getValue(),index.getKey(),partitionId + NULL_BYTE + language, cv, article.getTimestamp());
- wikiIndexOutput.put(gm, new CountAndSet(Integer.toString(article.getId())));
-
- // Create mutations for the global reverse index
- MutationInfo grm = new MutationInfo(StringUtils.reverse(index.getValue()),index.getKey(),partitionId + NULL_BYTE + language, cv, article.getTimestamp());
- wikiReverseIndexOutput.put(grm, new CountAndSet(Integer.toString(article.getId())));
-
- // Create mutations for the metadata table.
- MutationInfo mm = new MutationInfo(index.getKey(),METADATA_INDEX_COLUMN_FAMILY, language + NULL_BYTE + LcNoDiacriticsNormalizer.class.getName(), cv, article.getTimestamp());
- wikiMetadataOutput.put(mm, NULL_VALUE);
- }
- // Add the entire text to the document section of the table.
- // row is the partition, colf is 'd', colq is language\0articleid, value is Base64 encoded GZIP'd document
- m.put(DOCUMENT_COLUMN_FAMILY, colfPrefix + article.getId(), cv, article.getTimestamp(), new Value(Base64.encodeBase64(article.getText().getBytes())));
- context.write(tablename, m);
-
- } else {
- context.getCounter("wikipedia", "invalid articles").increment(1);
- }
- context.progress();
- }
-}
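
The mapper above leans on LRUOutputCombiner, which lives elsewhere in the removed module and is not shown in this mail. The following is an illustrative reconstruction of its apparent contract, inferred only from the Fold/Output callbacks, the put() calls in map(), and the flush() calls in cleanup(); the class below is a sketch, not the removed implementation:

    import java.util.LinkedHashMap;
    import java.util.Map;

    public class LruOutputCombinerSketch<K,V> extends LinkedHashMap<K,V> {
      public interface Fold<T> { T fold(T oldValue, T newValue); }
      public interface Output<A,B> { void output(A key, B value); }

      private final int capacity;
      private final Fold<V> fold;
      private final Output<K,V> out;

      public LruOutputCombinerSketch(int capacity, Fold<V> fold, Output<K,V> out) {
        super(16, 0.75f, true); // access order: eldest == least recently used
        this.capacity = capacity;
        this.fold = fold;
        this.out = out;
      }

      @Override
      public V put(K key, V value) {
        // Fold values for duplicate keys instead of overwriting.
        V old = get(key);
        return super.put(key, old == null ? value : fold.fold(old, value));
      }

      @Override
      protected boolean removeEldestEntry(Map.Entry<K,V> eldest) {
        // On overflow, emit the least recently used entry downstream.
        if (size() <= capacity)
          return false;
        out.output(eldest.getKey(), eldest.getValue());
        return true;
      }

      public void flush() {
        for (Map.Entry<K,V> e : entrySet())
          out.output(e.getKey(), e.getValue());
        clear();
      }
    }

Pre-combining this way trims the stream of mutations handed to the MultiTableBatchWriter when the same index term repeats within a task.
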
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitioner.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitioner.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitioner.java
deleted file mode 100644
index 3507108..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaPartitioner.java
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- *
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.nio.charset.Charset;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor.Article;
-import org.apache.accumulo.examples.wikisearch.ingest.WikipediaInputFormat.WikipediaInputSplit;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.lib.input.FileSplit;
-
-public class WikipediaPartitioner extends Mapper<LongWritable,Text,Text,Article> {
-
- // private static final Logger log = Logger.getLogger(WikipediaPartitioner.class);
-
- public final static Charset UTF8 = Charset.forName("UTF-8");
- public static final String DOCUMENT_COLUMN_FAMILY = "d";
- public static final String METADATA_EVENT_COLUMN_FAMILY = "e";
- public static final String METADATA_INDEX_COLUMN_FAMILY = "i";
- public static final String TOKENS_FIELD_NAME = "TEXT";
-
-  private final static Pattern languagePattern = Pattern.compile("([a-z_]+).*\\.xml(\\.bz2)?");
-
- private ArticleExtractor extractor;
- private String language;
-
- private int myGroup = -1;
- private int numGroups = -1;
-
- @Override
- public void setup(Context context) {
- Configuration conf = context.getConfiguration();
-
- WikipediaInputSplit wiSplit = (WikipediaInputSplit)context.getInputSplit();
- myGroup = wiSplit.getPartition();
- numGroups = WikipediaConfiguration.getNumGroups(conf);
-
- FileSplit split = wiSplit.getFileSplit();
- String fileName = split.getPath().getName();
- Matcher matcher = languagePattern.matcher(fileName);
- if (matcher.matches()) {
- language = matcher.group(1).replace('_', '-').toLowerCase();
- } else {
- throw new RuntimeException("Unknown ingest language! " + fileName);
- }
- extractor = new ArticleExtractor();
- }
-
- @Override
- protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
- Article article = extractor.extract(new InputStreamReader(new ByteArrayInputStream(value.getBytes()), UTF8));
- if (article != null) {
- int groupId = WikipediaMapper.getPartitionId(article, numGroups);
- if(groupId != myGroup)
- return;
- context.write(new Text(language), article);
- } else {
- context.getCounter("wikipedia", "invalid articles").increment(1);
- context.progress();
- }
- }
-
-}
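
The language key this mapper emits comes from the dump filename matched in setup(). A standalone illustration (the filename is hypothetical but follows Wikipedia dump naming):

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class LanguageKeySketch {
      public static void main(String[] args) {
        Pattern languagePattern = Pattern.compile("([a-z_]+).*\\.xml(\\.bz2)?");
        Matcher matcher = languagePattern.matcher("zh_yue-latest-pages-articles.xml.bz2");
        if (matcher.matches()) {
          // group(1) == "zh_yue"; underscores become hyphens, then lowercase.
          System.out.println(matcher.group(1).replace('_', '-').toLowerCase()); // zh-yue
        }
      }
    }
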
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/GlobalIndexUidCombiner.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/GlobalIndexUidCombiner.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/GlobalIndexUidCombiner.java
deleted file mode 100644
index 4702521..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/GlobalIndexUidCombiner.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.iterator;
-
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Map;
-
-import org.apache.accumulo.core.data.Key;
-import org.apache.accumulo.core.data.Value;
-import org.apache.accumulo.core.iterators.IteratorEnvironment;
-import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
-import org.apache.accumulo.core.iterators.TypedValueCombiner;
-import org.apache.accumulo.core.iterators.ValueFormatException;
-import org.apache.accumulo.examples.wikisearch.protobuf.Uid;
-
-import com.google.protobuf.InvalidProtocolBufferException;
-
-/**
- *
- */
-public class GlobalIndexUidCombiner extends TypedValueCombiner<Uid.List> {
- public static final Encoder<Uid.List> UID_LIST_ENCODER = new UidListEncoder();
- public static final int MAX = 20;
-
- @Override
- public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
- super.init(source, options, env);
- setEncoder(UID_LIST_ENCODER);
- }
-
- @Override
- public Uid.List typedReduce(Key key, Iterator<Uid.List> iter) {
- Uid.List.Builder builder = Uid.List.newBuilder();
- HashSet<String> uids = new HashSet<String>();
- boolean seenIgnore = false;
- long count = 0;
- while (iter.hasNext()) {
- Uid.List v = iter.next();
- if (null == v)
- continue;
- count = count + v.getCOUNT();
- if (v.getIGNORE()) {
- seenIgnore = true;
- }
- uids.addAll(v.getUIDList());
- }
- // Special case logic
- // If we have aggregated more than MAX UIDs, then null out the UID list and set IGNORE to true
- // However, always maintain the count
- builder.setCOUNT(count);
- if (uids.size() > MAX || seenIgnore) {
- builder.setIGNORE(true);
- builder.clearUID();
- } else {
- builder.setIGNORE(false);
- builder.addAllUID(uids);
- }
- return builder.build();
- }
-
- public static class UidListEncoder implements Encoder<Uid.List> {
- @Override
- public byte[] encode(Uid.List v) {
- return v.toByteArray();
- }
-
- @Override
- public Uid.List decode(byte[] b) {
- if (b.length == 0)
- return null;
- try {
- return Uid.List.parseFrom(b);
- } catch (InvalidProtocolBufferException e) {
- throw new ValueFormatException("Value passed to aggregator was not of type Uid.List");
- }
- }
- }
-}
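
The merge semantics of typedReduce(), in miniature. This fragment only compiles inside the removed module, since Uid.List is generated from its Uid.proto:

    // Two partial values for the same index term, from different ingest tasks:
    Uid.List a = Uid.List.newBuilder().setCOUNT(2).setIGNORE(false)
        .addAllUID(java.util.Arrays.asList("1001", "1002")).build();
    Uid.List b = Uid.List.newBuilder().setCOUNT(1).setIGNORE(false)
        .addAllUID(java.util.Arrays.asList("1002")).build();
    // Combining [a, b] yields COUNT=3 and UID={"1001","1002"}.
    // Had the merged UID set exceeded MAX (20), or had either input set
    // IGNORE, the result would keep COUNT=3 but drop the UID list and
    // set IGNORE=true -- the count survives, the exact UIDs do not.
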
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TextIndexCombiner.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TextIndexCombiner.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TextIndexCombiner.java
deleted file mode 100644
index 85f3e1e..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/iterator/TextIndexCombiner.java
+++ /dev/null
@@ -1,102 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.iterator;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.accumulo.core.data.Key;
-import org.apache.accumulo.core.data.Value;
-import org.apache.accumulo.core.iterators.IteratorEnvironment;
-import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
-import org.apache.accumulo.core.iterators.TypedValueCombiner;
-import org.apache.accumulo.core.iterators.ValueFormatException;
-import org.apache.accumulo.examples.wikisearch.protobuf.TermWeight;
-
-import com.google.protobuf.InvalidProtocolBufferException;
-
-/**
- *
- */
-public class TextIndexCombiner extends TypedValueCombiner<TermWeight.Info> {
- public static final Encoder<TermWeight.Info> TERMWEIGHT_INFO_ENCODER = new TermWeightInfoEncoder();
-
- @Override
- public TermWeight.Info typedReduce(Key key, Iterator<TermWeight.Info> iter) {
- TermWeight.Info.Builder builder = TermWeight.Info.newBuilder();
- List<Integer> offsets = new ArrayList<Integer>();
- float normalizedTermFrequency = 0f;
-
- while (iter.hasNext()) {
- TermWeight.Info info = iter.next();
- if (null == info)
- continue;
-
- // Add each offset into the list maintaining sorted order
- for (int offset : info.getWordOffsetList()) {
- int pos = Collections.binarySearch(offsets, offset);
-
- if (pos < 0) {
- // Undo the transform on the insertion point
- offsets.add((-1 * pos) - 1, offset);
- } else {
- offsets.add(pos, offset);
- }
- }
-
- if (info.getNormalizedTermFrequency() > 0) {
- normalizedTermFrequency += info.getNormalizedTermFrequency();
- }
- }
-
- // Keep the sorted order we tried to maintain
- for (int i = 0; i < offsets.size(); ++i) {
- builder.addWordOffset(offsets.get(i));
- }
-
- builder.setNormalizedTermFrequency(normalizedTermFrequency);
- return builder.build();
- }
-
- @Override
- public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
- super.init(source, options, env);
- setEncoder(TERMWEIGHT_INFO_ENCODER);
- }
-
- public static class TermWeightInfoEncoder implements Encoder<TermWeight.Info> {
- @Override
- public byte[] encode(TermWeight.Info v) {
- return v.toByteArray();
- }
-
- @Override
- public TermWeight.Info decode(byte[] b) {
- if (b.length == 0)
- return null;
- try {
- return TermWeight.Info.parseFrom(b);
- } catch (InvalidProtocolBufferException e) {
- throw new ValueFormatException("Value passed to aggregator was not of type TermWeight.Info");
- }
- }
- }
-}
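
The (-1 * pos) - 1 arithmetic above is the standard Collections.binarySearch insertion idiom; a standalone illustration:

    import java.util.ArrayList;
    import java.util.Arrays;
    import java.util.Collections;
    import java.util.List;

    public class SortedInsertSketch {
      public static void main(String[] args) {
        List<Integer> offsets = new ArrayList<Integer>(Arrays.asList(3, 8, 15));
        int offset = 10;
        // binarySearch returns (-(insertionPoint) - 1) when the key is
        // absent, so (-1 * pos) - 1 recovers the insertion point.
        int pos = Collections.binarySearch(offsets, offset);
        if (pos < 0)
          offsets.add((-1 * pos) - 1, offset);
        else
          offsets.add(pos, offset);
        System.out.println(offsets); // [3, 8, 10, 15]
      }
    }
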
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java
deleted file mode 100644
index b15a5ee..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/LcNoDiacriticsNormalizer.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.normalizer;
-
-import java.text.Normalizer;
-import java.text.Normalizer.Form;
-import java.util.Locale;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-/**
- * A {@link Normalizer} which performs the following steps:
- * <ol>
- * <li>Unicode canonical decomposition ({@link Form#NFD})</li>
- * <li>Removal of diacritical marks</li>
- * <li>Unicode canonical composition ({@link Form#NFC})</li>
- * <li>lower casing in the {@link Locale#ENGLISH English locale}</li>
- * </ol>
- */
-public class LcNoDiacriticsNormalizer implements org.apache.accumulo.examples.wikisearch.normalizer.Normalizer {
- private static final Pattern diacriticals = Pattern.compile("\\p{InCombiningDiacriticalMarks}");
-
- public String normalizeFieldValue(String fieldName, Object fieldValue) {
- String decomposed = Normalizer.normalize(fieldValue.toString(), Form.NFD);
- String noDiacriticals = removeDiacriticalMarks(decomposed);
- String recomposed = Normalizer.normalize(noDiacriticals, Form.NFC);
- return recomposed.toLowerCase(Locale.ENGLISH);
- }
-
- private String removeDiacriticalMarks(String str) {
- Matcher matcher = diacriticals.matcher(str);
- return matcher.replaceAll("");
- }
-
-}
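
A usage sketch, assuming the class above is on the classpath (the field name argument is ignored by this normalizer):

    public class NormalizerDemo {
      public static void main(String[] args) {
        LcNoDiacriticsNormalizer normalizer = new LcNoDiacriticsNormalizer();
        // NFD decomposition -> strip combining marks -> NFC -> lowercase:
        System.out.println(normalizer.normalizeFieldValue("TITLE", "Émigré")); // emigre
      }
    }
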
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java
deleted file mode 100644
index 7498f76..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NoOpNormalizer.java
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.normalizer;
-
-public class NoOpNormalizer implements Normalizer {
- public String normalizeFieldValue(String field, Object value) {
- return value.toString();
- }
-}
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java
deleted file mode 100644
index ba3632a..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/Normalizer.java
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.normalizer;
-
-public interface Normalizer {
-
- /**
- * Creates normalized content for ingest based upon implemented logic.
- *
- * @param field
- * The field being normalized
- * @param value
- * The value to normalize
- * @return a normalized value
- */
- public String normalizeFieldValue(String field, Object value);
-
-}
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java
deleted file mode 100644
index e0a5cc8..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/normalizer/NumberNormalizer.java
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.normalizer;
-
-import org.apache.commons.lang.math.NumberUtils;
-import org.apache.lucene.util.NumericUtils;
-
-public class NumberNormalizer implements Normalizer {
-
- public String normalizeFieldValue(String field, Object value) {
- if (NumberUtils.isNumber(value.toString())) {
- Number n = NumberUtils.createNumber(value.toString());
- if (n instanceof Integer)
- return NumericUtils.intToPrefixCoded((Integer) n);
- else if (n instanceof Long)
- return NumericUtils.longToPrefixCoded((Long) n);
- else if (n instanceof Float)
- return NumericUtils.floatToPrefixCoded((Float) n);
- else if (n instanceof Double)
- return NumericUtils.doubleToPrefixCoded((Double) n);
- else
- throw new IllegalArgumentException("Unhandled numeric type: " + n.getClass());
- } else {
- throw new IllegalArgumentException("Value is not a number: " + value);
- }
- }
-
-}
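
The point of the prefix coding above is that encoded values sort lexicographically in numeric order, which is what a term index needs. A sketch, assuming the removed classes and the contemporary Lucene NumericUtils are on the classpath:

    public class NumberOrderDemo {
      public static void main(String[] args) {
        Normalizer n = new NumberNormalizer();
        String nine = n.normalizeFieldValue("FILE_SIZE", "9");
        String ten = n.normalizeFieldValue("FILE_SIZE", "10");
        // As plain strings "10" < "9", but the prefix-coded forms
        // compare in true numeric order:
        System.out.println(nine.compareTo(ten) < 0); // true
      }
    }
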
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/output/BufferingRFileRecordWriter.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/output/BufferingRFileRecordWriter.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/output/BufferingRFileRecordWriter.java
deleted file mode 100644
index 9b663de..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/output/BufferingRFileRecordWriter.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.output;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Map.Entry;
-import java.util.TreeMap;
-
-import org.apache.accumulo.core.conf.AccumuloConfiguration;
-import org.apache.accumulo.core.data.ColumnUpdate;
-import org.apache.accumulo.core.data.Key;
-import org.apache.accumulo.core.data.Mutation;
-import org.apache.accumulo.core.data.Value;
-import org.apache.accumulo.core.file.FileSKVWriter;
-import org.apache.accumulo.core.file.rfile.RFileOperations;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-
-final class BufferingRFileRecordWriter extends RecordWriter<Text,Mutation> {
- private final long maxSize;
- private final AccumuloConfiguration acuconf;
- private final Configuration conf;
- private final String filenamePrefix;
- private final String taskID;
- private final FileSystem fs;
- private int fileCount = 0;
- private long size;
-
- private Map<Text,TreeMap<Key,Value>> buffers = new HashMap<Text,TreeMap<Key,Value>>();
- private Map<Text,Long> bufferSizes = new HashMap<Text,Long>();
-
- private TreeMap<Key,Value> getBuffer(Text tablename) {
- TreeMap<Key,Value> buffer = buffers.get(tablename);
- if (buffer == null) {
- buffer = new TreeMap<Key,Value>();
- buffers.put(tablename, buffer);
- bufferSizes.put(tablename, 0l);
- }
- return buffer;
- }
-
- private Text getLargestTablename() {
- long max = 0;
- Text table = null;
- for (Entry<Text,Long> e : bufferSizes.entrySet()) {
- if (e.getValue() > max) {
- max = e.getValue();
- table = e.getKey();
- }
- }
- return table;
- }
-
- private void flushLargestTable() throws IOException {
- Text tablename = getLargestTablename();
- if (tablename == null)
- return;
- long bufferSize = bufferSizes.get(tablename);
- TreeMap<Key,Value> buffer = buffers.get(tablename);
- if (buffer.size() == 0)
- return;
-
- String file = filenamePrefix + "/" + tablename + "/" + taskID + "_" + (fileCount++) + ".rf";
- // TODO get the table configuration for the given table?
- FileSKVWriter writer = RFileOperations.getInstance().openWriter(file, fs, conf, acuconf);
-
- // forget locality groups for now, just write everything to the default
- writer.startDefaultLocalityGroup();
-
- for (Entry<Key,Value> e : buffer.entrySet()) {
- writer.append(e.getKey(), e.getValue());
- }
-
- writer.close();
-
- size -= bufferSize;
- buffer.clear();
- bufferSizes.put(tablename, 0l);
- }
-
- BufferingRFileRecordWriter(long maxSize, AccumuloConfiguration acuconf, Configuration conf, String filenamePrefix, String taskID, FileSystem fs) {
- this.maxSize = maxSize;
- this.acuconf = acuconf;
- this.conf = conf;
- this.filenamePrefix = filenamePrefix;
- this.taskID = taskID;
- this.fs = fs;
- }
-
- @Override
- public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
- while (size > 0)
- flushLargestTable();
- }
-
- @Override
- public void write(Text table, Mutation mutation) throws IOException, InterruptedException {
- TreeMap<Key,Value> buffer = getBuffer(table);
- int mutationSize = 0;
- for (ColumnUpdate update : mutation.getUpdates()) {
- Key k = new Key(mutation.getRow(), update.getColumnFamily(), update.getColumnQualifier(), update.getColumnVisibility(), update.getTimestamp(),
- update.isDeleted());
- Value v = new Value(update.getValue());
- // TODO account for object overhead
- mutationSize += k.getSize();
- mutationSize += v.getSize();
- buffer.put(k, v);
- }
- size += mutationSize;
- long bufferSize = bufferSizes.get(table);
-
- // TODO use a MutableLong instead
- bufferSize += mutationSize;
- bufferSizes.put(table, bufferSize);
-
- while (size >= maxSize) {
- flushLargestTable();
- }
- }
-
-}
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/output/SortingRFileOutputFormat.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/output/SortingRFileOutputFormat.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/output/SortingRFileOutputFormat.java
deleted file mode 100644
index 1fa8fdc..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/output/SortingRFileOutputFormat.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.output;
-
-import java.io.IOException;
-
-import org.apache.accumulo.core.conf.AccumuloConfiguration;
-import org.apache.accumulo.core.data.Mutation;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.OutputCommitter;
-import org.apache.hadoop.mapreduce.OutputFormat;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-
-public class SortingRFileOutputFormat extends OutputFormat<Text,Mutation> {
-
- // private static final Logger log = Logger.getLogger(SortingRFileOutputFormat.class);
-
- public static final String PATH_NAME = "sortingrfileoutputformat.path";
- public static final String MAX_BUFFER_SIZE = "sortingrfileoutputformat.max.buffer.size";
-
- public static void setPathName(Configuration conf, String path) {
- conf.set(PATH_NAME, path);
- }
-
- public static String getPathName(Configuration conf) {
- return conf.get(PATH_NAME);
- }
-
- public static void setMaxBufferSize(Configuration conf, long maxBufferSize) {
- conf.setLong(MAX_BUFFER_SIZE, maxBufferSize);
- }
-
- public static long getMaxBufferSize(Configuration conf) {
- return conf.getLong(MAX_BUFFER_SIZE, -1);
- }
-
- @Override
- public void checkOutputSpecs(JobContext job) throws IOException, InterruptedException {
- // TODO make sure the path is writable?
- // TODO make sure the max buffer size is set and is reasonable
- }
-
- @Override
- public OutputCommitter getOutputCommitter(TaskAttemptContext arg0) throws IOException, InterruptedException {
- return new OutputCommitter() {
-
- @Override
- public void setupTask(TaskAttemptContext arg0) throws IOException {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setupJob(JobContext arg0) throws IOException {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public boolean needsTaskCommit(TaskAttemptContext arg0) throws IOException {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public void commitTask(TaskAttemptContext arg0) throws IOException {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void cleanupJob(JobContext arg0) throws IOException {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void abortTask(TaskAttemptContext arg0) throws IOException {
- // TODO Auto-generated method stub
-
- }
- };
- }
-
- @Override
- public RecordWriter<Text,Mutation> getRecordWriter(TaskAttemptContext attempt) throws IOException, InterruptedException {
-
- // grab the configuration
- final Configuration conf = attempt.getConfiguration();
- // create a filename
- final String filenamePrefix = getPathName(conf);
- final String taskID = attempt.getTaskAttemptID().toString();
- // grab the max size
- final long maxSize = getMaxBufferSize(conf);
- // grab the FileSystem
- final FileSystem fs = FileSystem.get(conf);
- // create a default AccumuloConfiguration
- final AccumuloConfiguration acuconf = AccumuloConfiguration.getDefaultConfiguration();
-
- return new BufferingRFileRecordWriter(maxSize, acuconf, conf, filenamePrefix, taskID, fs);
- }
-
-}
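For reference, the SortingRFileOutputFormat deleted above was configured entirely through its two static setters: checkOutputSpecs() validated nothing, and getMaxBufferSize() falls back to -1 when its key is absent, so both values had to be set before job submission. A minimal sketch of that wiring, assuming the class were still on the classpath; the driver class name, output path, and 256 MB buffer size are illustrative, not taken from the deleted code:

    import org.apache.accumulo.core.data.Mutation;
    import org.apache.accumulo.examples.wikisearch.output.SortingRFileOutputFormat;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Job;

    public class IngestJobSetup { // hypothetical driver, for illustration only
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Both keys must be present: the format performs no validation of its own.
        SortingRFileOutputFormat.setPathName(conf, "/tmp/wikisearch-rfiles");
        SortingRFileOutputFormat.setMaxBufferSize(conf, 256L * 1024 * 1024);

        Job job = new Job(conf, "wikisearch-partitioned-ingest");
        job.setOutputFormatClass(SortingRFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);        // the Text key names the destination table
        job.setOutputValueClass(Mutation.class);  // mutations are buffered, sorted, and spilled to RFiles
        job.waitForCompletion(true);
      }
    }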
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/TermWeight.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/TermWeight.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/TermWeight.java
deleted file mode 100644
index bf5133f..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/protobuf/TermWeight.java
+++ /dev/null
@@ -1,424 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-// Generated by the protocol buffer compiler. DO NOT EDIT!
-// source: TermWeight.proto
-
-package org.apache.accumulo.examples.wikisearch.protobuf;
-
-public final class TermWeight {
- private TermWeight() {}
-
- public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {}
-
- public static final class Info extends com.google.protobuf.GeneratedMessage {
- // Use Info.newBuilder() to construct.
- private Info() {
- initFields();
- }
-
- private Info(boolean noInit) {}
-
- private static final Info defaultInstance;
-
- public static Info getDefaultInstance() {
- return defaultInstance;
- }
-
- public Info getDefaultInstanceForType() {
- return defaultInstance;
- }
-
- public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
- return org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.internal_static_protobuf_Info_descriptor;
- }
-
- protected com.google.protobuf.GeneratedMessage.FieldAccessorTable internalGetFieldAccessorTable() {
- return org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.internal_static_protobuf_Info_fieldAccessorTable;
- }
-
- // required float normalizedTermFrequency = 1;
- public static final int NORMALIZEDTERMFREQUENCY_FIELD_NUMBER = 1;
- private boolean hasNormalizedTermFrequency;
- private float normalizedTermFrequency_ = 0F;
-
- public boolean hasNormalizedTermFrequency() {
- return hasNormalizedTermFrequency;
- }
-
- public float getNormalizedTermFrequency() {
- return normalizedTermFrequency_;
- }
-
- // repeated uint32 wordOffset = 2;
- public static final int WORDOFFSET_FIELD_NUMBER = 2;
- private java.util.List<java.lang.Integer> wordOffset_ = java.util.Collections.emptyList();
-
- public java.util.List<java.lang.Integer> getWordOffsetList() {
- return wordOffset_;
- }
-
- public int getWordOffsetCount() {
- return wordOffset_.size();
- }
-
- public int getWordOffset(int index) {
- return wordOffset_.get(index);
- }
-
- private void initFields() {}
-
- public final boolean isInitialized() {
- if (!hasNormalizedTermFrequency)
- return false;
- return true;
- }
-
- public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException {
- getSerializedSize();
- if (hasNormalizedTermFrequency()) {
- output.writeFloat(1, getNormalizedTermFrequency());
- }
- for (int element : getWordOffsetList()) {
- output.writeUInt32(2, element);
- }
- getUnknownFields().writeTo(output);
- }
-
- private int memoizedSerializedSize = -1;
-
- public int getSerializedSize() {
- int size = memoizedSerializedSize;
- if (size != -1)
- return size;
-
- size = 0;
- if (hasNormalizedTermFrequency()) {
- size += com.google.protobuf.CodedOutputStream.computeFloatSize(1, getNormalizedTermFrequency());
- }
- {
- int dataSize = 0;
- for (int element : getWordOffsetList()) {
- dataSize += com.google.protobuf.CodedOutputStream.computeUInt32SizeNoTag(element);
- }
- size += dataSize;
- size += 1 * getWordOffsetList().size();
- }
- size += getUnknownFields().getSerializedSize();
- memoizedSerializedSize = size;
- return size;
- }
-
- public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException {
- return newBuilder().mergeFrom(data).buildParsed();
- }
-
- public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.ByteString data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
- }
-
- public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException {
- return newBuilder().mergeFrom(data).buildParsed();
- }
-
- public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws com.google.protobuf.InvalidProtocolBufferException {
- return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
- }
-
- public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(java.io.InputStream input) throws java.io.IOException {
- return newBuilder().mergeFrom(input).buildParsed();
- }
-
- public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
- }
-
- public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseDelimitedFrom(java.io.InputStream input) throws java.io.IOException {
- Builder builder = newBuilder();
- if (builder.mergeDelimitedFrom(input)) {
- return builder.buildParsed();
- } else {
- return null;
- }
- }
-
- public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseDelimitedFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- Builder builder = newBuilder();
- if (builder.mergeDelimitedFrom(input, extensionRegistry)) {
- return builder.buildParsed();
- } else {
- return null;
- }
- }
-
- public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.CodedInputStream input) throws java.io.IOException {
- return newBuilder().mergeFrom(input).buildParsed();
- }
-
- public static org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
- }
-
- public static Builder newBuilder() {
- return Builder.create();
- }
-
- public Builder newBuilderForType() {
- return newBuilder();
- }
-
- public static Builder newBuilder(org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info prototype) {
- return newBuilder().mergeFrom(prototype);
- }
-
- public Builder toBuilder() {
- return newBuilder(this);
- }
-
- public static final class Builder extends com.google.protobuf.GeneratedMessage.Builder<Builder> {
- private org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info result;
-
- // Construct using protobuf.TermWeight.Info.newBuilder()
- private Builder() {}
-
- private static Builder create() {
- Builder builder = new Builder();
- builder.result = new org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info();
- return builder;
- }
-
- protected org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info internalGetResult() {
- return result;
- }
-
- public Builder clear() {
- if (result == null) {
- throw new IllegalStateException("Cannot call clear() after build().");
- }
- result = new org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info();
- return this;
- }
-
- public Builder clone() {
- return create().mergeFrom(result);
- }
-
- public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() {
- return org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.getDescriptor();
- }
-
- public org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info getDefaultInstanceForType() {
- return org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.getDefaultInstance();
- }
-
- public boolean isInitialized() {
- return result.isInitialized();
- }
-
- public org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info build() {
- if (result != null && !isInitialized()) {
- throw newUninitializedMessageException(result);
- }
- return buildPartial();
- }
-
- private org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info buildParsed() throws com.google.protobuf.InvalidProtocolBufferException {
- if (!isInitialized()) {
- throw newUninitializedMessageException(result).asInvalidProtocolBufferException();
- }
- return buildPartial();
- }
-
- public org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info buildPartial() {
- if (result == null) {
- throw new IllegalStateException("build() has already been called on this Builder.");
- }
- if (result.wordOffset_ != java.util.Collections.EMPTY_LIST) {
- result.wordOffset_ = java.util.Collections.unmodifiableList(result.wordOffset_);
- }
- org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info returnMe = result;
- result = null;
- return returnMe;
- }
-
- public Builder mergeFrom(com.google.protobuf.Message other) {
- if (other instanceof org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info) {
- return mergeFrom((org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info) other);
- } else {
- super.mergeFrom(other);
- return this;
- }
- }
-
- public Builder mergeFrom(org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info other) {
- if (other == org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.getDefaultInstance())
- return this;
- if (other.hasNormalizedTermFrequency()) {
- setNormalizedTermFrequency(other.getNormalizedTermFrequency());
- }
- if (!other.wordOffset_.isEmpty()) {
- if (result.wordOffset_.isEmpty()) {
- result.wordOffset_ = new java.util.ArrayList<java.lang.Integer>();
- }
- result.wordOffset_.addAll(other.wordOffset_);
- }
- this.mergeUnknownFields(other.getUnknownFields());
- return this;
- }
-
- public Builder mergeFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
- throws java.io.IOException {
- com.google.protobuf.UnknownFieldSet.Builder unknownFields = com.google.protobuf.UnknownFieldSet.newBuilder(this.getUnknownFields());
- while (true) {
- int tag = input.readTag();
- switch (tag) {
- case 0:
- this.setUnknownFields(unknownFields.build());
- return this;
- default: {
- if (!parseUnknownField(input, unknownFields, extensionRegistry, tag)) {
- this.setUnknownFields(unknownFields.build());
- return this;
- }
- break;
- }
- case 13: {
- setNormalizedTermFrequency(input.readFloat());
- break;
- }
- case 16: {
- addWordOffset(input.readUInt32());
- break;
- }
- case 18: {
- int length = input.readRawVarint32();
- int limit = input.pushLimit(length);
- while (input.getBytesUntilLimit() > 0) {
- addWordOffset(input.readUInt32());
- }
- input.popLimit(limit);
- break;
- }
- }
- }
- }
-
- // required float normalizedTermFrequency = 1;
- public boolean hasNormalizedTermFrequency() {
- return result.hasNormalizedTermFrequency();
- }
-
- public float getNormalizedTermFrequency() {
- return result.getNormalizedTermFrequency();
- }
-
- public Builder setNormalizedTermFrequency(float value) {
- result.hasNormalizedTermFrequency = true;
- result.normalizedTermFrequency_ = value;
- return this;
- }
-
- public Builder clearNormalizedTermFrequency() {
- result.hasNormalizedTermFrequency = false;
- result.normalizedTermFrequency_ = 0F;
- return this;
- }
-
- // repeated uint32 wordOffset = 2;
- public java.util.List<java.lang.Integer> getWordOffsetList() {
- return java.util.Collections.unmodifiableList(result.wordOffset_);
- }
-
- public int getWordOffsetCount() {
- return result.getWordOffsetCount();
- }
-
- public int getWordOffset(int index) {
- return result.getWordOffset(index);
- }
-
- public Builder setWordOffset(int index, int value) {
- result.wordOffset_.set(index, value);
- return this;
- }
-
- public Builder addWordOffset(int value) {
- if (result.wordOffset_.isEmpty()) {
- result.wordOffset_ = new java.util.ArrayList<java.lang.Integer>();
- }
- result.wordOffset_.add(value);
- return this;
- }
-
- public Builder addAllWordOffset(java.lang.Iterable<? extends java.lang.Integer> values) {
- if (result.wordOffset_.isEmpty()) {
- result.wordOffset_ = new java.util.ArrayList<java.lang.Integer>();
- }
- super.addAll(values, result.wordOffset_);
- return this;
- }
-
- public Builder clearWordOffset() {
- result.wordOffset_ = java.util.Collections.emptyList();
- return this;
- }
-
- // @@protoc_insertion_point(builder_scope:protobuf.Info)
- }
-
- static {
- defaultInstance = new Info(true);
- org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.internalForceInit();
- defaultInstance.initFields();
- }
-
- // @@protoc_insertion_point(class_scope:protobuf.Info)
- }
-
- private static com.google.protobuf.Descriptors.Descriptor internal_static_protobuf_Info_descriptor;
- private static com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_protobuf_Info_fieldAccessorTable;
-
- public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
- return descriptor;
- }
-
- private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
- static {
- java.lang.String[] descriptorData = {"\n\020TermWeight.proto\022\010protobuf\";\n\004Info\022\037\n\027"
- + "normalizedTermFrequency\030\001 \002(\002\022\022\n\nwordOff" + "set\030\002 \003(\rB\014\n\010protobufH\001"};
- com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() {
- public com.google.protobuf.ExtensionRegistry assignDescriptors(com.google.protobuf.Descriptors.FileDescriptor root) {
- descriptor = root;
- internal_static_protobuf_Info_descriptor = getDescriptor().getMessageTypes().get(0);
- internal_static_protobuf_Info_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable(
- internal_static_protobuf_Info_descriptor, new java.lang.String[] {"NormalizedTermFrequency", "WordOffset",}, org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.class,
- org.apache.accumulo.examples.wikisearch.protobuf.TermWeight.Info.Builder.class);
- return null;
- }
- };
- com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[] {},
- assigner);
- }
-
- public static void internalForceInit() {}
-
- // @@protoc_insertion_point(outer_class_scope)
-}
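For reference, the generated TermWeight.Info message deleted above carried a required float normalizedTermFrequency (field 1) and a repeated uint32 wordOffset (field 2). A minimal round trip through the generated builder, assuming the class were still available; the frequency and offset values are illustrative:

    import org.apache.accumulo.examples.wikisearch.protobuf.TermWeight;

    TermWeight.Info info = TermWeight.Info.newBuilder()
        .setNormalizedTermFrequency(0.75f) // required field: build() throws if this is never set
        .addWordOffset(13)
        .addWordOffset(42)                 // one entry per position of the term in the article
        .build();
    byte[] serialized = info.toByteArray();           // the bytes stored as an Accumulo value
    TermWeight.Info parsed = TermWeight.Info.parseFrom(serialized);
    assert parsed.getWordOffsetCount() == 2;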