You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@accumulo.apache.org by ec...@apache.org on 2012/01/06 23:02:13 UTC
svn commit: r1228459 [2/13] - in /incubator/accumulo/branches/1.4: ./
contrib/accumulo_sample/
src/examples/src/main/java/org/apache/accumulo/examples/wikisearch/
src/trace/ src/wikisearch/ src/wikisearch/ingest/
src/wikisearch/ingest/bin/ src/wikisear...
Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/iterator/TotalAggregatingIterator.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/iterator/TotalAggregatingIterator.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/iterator/TotalAggregatingIterator.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/iterator/TotalAggregatingIterator.java Fri Jan 6 22:02:09 2012
@@ -0,0 +1,204 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.iterator;
+
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.accumulo.core.data.ByteSequence;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.PartialKey;
+import org.apache.accumulo.core.data.Range;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.core.iterators.IteratorEnvironment;
+import org.apache.accumulo.core.iterators.OptionDescriber;
+import org.apache.accumulo.core.iterators.SortedKeyValueIterator;
+import org.apache.accumulo.core.iterators.aggregation.Aggregator;
+import org.apache.accumulo.core.iterators.conf.ColumnToClassMapping;
+import org.apache.accumulo.start.classloader.AccumuloClassLoader;
+
+/**
+ * Aggregates all values with the same key (row, colf, colq, colVis.), regardless of timestamp, into a single
+ * key/value pair using one configured {@link Aggregator}.
+ *
+ * <p>
+ * The aggregator is created from the iterator options: the value of the (single) option is the name of a class
+ * implementing {@link Aggregator}, which is loaded and instantiated through its no-argument constructor.
+ */
+public class TotalAggregatingIterator implements SortedKeyValueIterator<Key,Value>, OptionDescriber {
+
+  /** Source iterator whose entries are aggregated. */
+  private SortedKeyValueIterator<Key,Value> iterator;
+
+  /** Scratch key that is overwritten on every aggregation; {@link #aggrKey} points at it while a result is pending. */
+  private Key workKey = new Key();
+
+  /** Key of the pending aggregated entry, or {@code null} when no aggregated entry is available. */
+  private Key aggrKey;
+  /** Aggregated value belonging to {@link #aggrKey}. */
+  private Value aggrValue;
+
+  /** Aggregator applied to every run of equal keys; stays {@code null} until options supply one. */
+  private Aggregator agg;
+
+  /**
+   * Creates a copy of this iterator on top of a deep copy of its source.
+   *
+   * @param env
+   *          environment handed to the source's {@code deepCopy}
+   * @return the new iterator
+   */
+  @Override
+  public TotalAggregatingIterator deepCopy(IteratorEnvironment env) {
+    return new TotalAggregatingIterator(this, env);
+  }
+
+  private TotalAggregatingIterator(TotalAggregatingIterator other, IteratorEnvironment env) {
+    iterator = other.iterator.deepCopy(env);
+    // NOTE(review): the Aggregator instance is shared with the original, not copied. If aggregators keep state between
+    // reset() and aggregate(), the original and the copy must not be driven concurrently - confirm with callers.
+    agg = other.agg;
+  }
+
+  public TotalAggregatingIterator() {}
+
+  /**
+   * Collapses the source's current entry and all directly following entries with the same row, column family, column
+   * qualifier and column visibility into one aggregated key/value pair. The resulting key is a copy of the first key
+   * of the run.
+   *
+   * @param aggr
+   *          aggregator to feed the values into
+   * @throws IllegalStateException
+   *           if no aggregator has been configured
+   */
+  private void aggregateRowColumn(Aggregator aggr) throws IOException {
+    // this function assumes that first value is not delete
+
+    if (aggr == null) {
+      // Fail with an explanation instead of a bare NullPointerException on aggr.reset().
+      throw new IllegalStateException("No aggregator configured: the aggregator class name must be supplied as an iterator option");
+    }
+
+    workKey.set(iterator.getTopKey());
+
+    Key keyToAggregate = workKey;
+
+    aggr.reset();
+
+    aggr.collect(iterator.getTopValue());
+    iterator.next();
+
+    while (iterator.hasTop() && iterator.getTopKey().equals(keyToAggregate, PartialKey.ROW_COLFAM_COLQUAL_COLVIS)) {
+      aggr.collect(iterator.getTopValue());
+      iterator.next();
+    }
+
+    aggrKey = workKey;
+    aggrValue = aggr.aggregate();
+
+  }
+
+  /** Prepares the next aggregated entry if the source still has data; otherwise leaves the state untouched. */
+  private void findTop() throws IOException {
+    // check if aggregation is needed
+    if (iterator.hasTop()) {
+      aggregateRowColumn(agg);
+    }
+  }
+
+  /**
+   * Wraps the given source iterator.
+   *
+   * @param iterator
+   *          source iterator
+   * @param aggregators
+   *          currently ignored; this constructor does not configure an aggregator, so one still has to be supplied
+   *          through {@link #init} or {@link #validateOptions} before any data can be aggregated
+   */
+  public TotalAggregatingIterator(SortedKeyValueIterator<Key,Value> iterator, ColumnToClassMapping<Aggregator> aggregators) throws IOException {
+    this.iterator = iterator;
+  }
+
+  @Override
+  public Key getTopKey() {
+    if (aggrKey != null) {
+      return aggrKey;
+    }
+    return iterator.getTopKey();
+  }
+
+  @Override
+  public Value getTopValue() {
+    if (aggrKey != null) {
+      return aggrValue;
+    }
+    return iterator.getTopValue();
+  }
+
+  @Override
+  public boolean hasTop() {
+    return aggrKey != null || iterator.hasTop();
+  }
+
+  @Override
+  public void next() throws IOException {
+    if (aggrKey != null) {
+      // The source was already advanced past the aggregated run; just drop the pending result.
+      aggrKey = null;
+      aggrValue = null;
+    } else {
+      iterator.next();
+    }
+
+    findTop();
+  }
+
+  @Override
+  public void seek(Range range, Collection<ByteSequence> columnFamilies, boolean inclusive) throws IOException {
+    // do not want to seek to the middle of a value that should be
+    // aggregated...
+
+    Range seekRange = maximizeStartKeyTimeStamp(range);
+
+    iterator.seek(seekRange, columnFamilies, inclusive);
+    findTop();
+
+    if (range.getStartKey() != null) {
+      while (hasTop() && getTopKey().equals(range.getStartKey(), PartialKey.ROW_COLFAM_COLQUAL_COLVIS)
+          && getTopKey().getTimestamp() > range.getStartKey().getTimestamp()) {
+        // the value has a more recent time stamp, so
+        // pass it up
+        next();
+      }
+
+      while (hasTop() && range.beforeStartKey(getTopKey())) {
+        next();
+      }
+    }
+
+  }
+
+  @Override
+  public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> options, IteratorEnvironment env) throws IOException {
+    agg = createAggregator(options);
+    this.iterator = source;
+  }
+
+  @Override
+  public IteratorOptions describeOptions() {
+    return new IteratorOptions("agg", "Aggregators apply aggregating functions to values with identical keys", null,
+        Collections.singletonList("* <aggregatorClass>"));
+  }
+
+  /**
+   * Checks that exactly one option is present and that its value names a loadable, instantiable {@link Aggregator}.
+   * As a side effect the created aggregator is stored in this iterator.
+   *
+   * @param options
+   *          iterator options to validate
+   * @return {@code true} when the options are valid
+   * @throws IllegalArgumentException
+   *           if more than one option is given, if no aggregator class is given, or if the class cannot be loaded or
+   *           instantiated
+   */
+  @Override
+  public boolean validateOptions(Map<String,String> options) {
+    if (options.size() > 1)
+      throw new IllegalArgumentException("This iterator only accepts one configuration option, the name of the aggregating class");
+    Aggregator created = createAggregator(options);
+    if (created == null) {
+      // An empty option map would otherwise validate and then fail on the first aggregated entry.
+      throw new IllegalArgumentException("This iterator requires one configuration option, the name of the aggregating class");
+    }
+    agg = created;
+    return true;
+  }
+
+  /**
+   * Instantiates the aggregator class named by the option value(s). If several options are present the aggregator
+   * created for the last iterated entry is returned.
+   *
+   * @param options
+   *          iterator options whose values are aggregator class names
+   * @return the aggregator, or {@code null} if {@code options} is empty
+   * @throws IllegalArgumentException
+   *           if a class cannot be found, instantiated or accessed; the original exception is attached as the cause
+   */
+  private Aggregator createAggregator(Map<String,String> options) {
+    Aggregator a = null;
+    for (Entry<String,String> entry : options.entrySet()) {
+      String className = entry.getValue();
+      try {
+        Class<? extends Aggregator> clazz = AccumuloClassLoader.loadClass(className, Aggregator.class);
+        a = clazz.newInstance();
+      } catch (ClassNotFoundException e) {
+        throw new IllegalArgumentException("class not found: " + className, e);
+      } catch (InstantiationException e) {
+        throw new IllegalArgumentException("instantiation exception: " + className, e);
+      } catch (IllegalAccessException e) {
+        throw new IllegalArgumentException("illegal access exception: " + className, e);
+      }
+    }
+    return a;
+  }
+
+  /**
+   * Returns a range whose start key carries the maximum timestamp, so that a seek starts at the first entry of the
+   * start key's row/column instead of in the middle of the entries that must be aggregated together. When the start
+   * key is rewritten the new range always includes it.
+   *
+   * @param range
+   *          requested range
+   * @return {@code range} itself if it has no start key or the start key already has timestamp
+   *         {@link Long#MAX_VALUE}; otherwise a new range with the adjusted, inclusive start key and the original end
+   */
+  static Range maximizeStartKeyTimeStamp(Range range) {
+    Range seekRange = range;
+
+    if (range.getStartKey() != null && range.getStartKey().getTimestamp() != Long.MAX_VALUE) {
+      Key seekKey = new Key(seekRange.getStartKey());
+      seekKey.setTimestamp(Long.MAX_VALUE);
+      seekRange = new Range(seekKey, true, range.getEndKey(), range.isEndKeyInclusive());
+    }
+
+    return seekRange;
+  }
+}
Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/iterator/TotalAggregatingIterator.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/LcNoDiacriticsNormalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/LcNoDiacriticsNormalizer.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/LcNoDiacriticsNormalizer.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/LcNoDiacriticsNormalizer.java Fri Jan 6 22:02:09 2012
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.normalizer;
+
+import java.text.Normalizer;
+import java.text.Normalizer.Form;
+import java.util.Locale;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * A {@link org.apache.accumulo.wikisearch.normalizer.Normalizer} which performs the following steps:
+ * <ol>
+ * <li>Unicode canonical decomposition ({@link Form#NFD})</li>
+ * <li>Removal of diacritical marks</li>
+ * <li>Unicode canonical composition ({@link Form#NFC})</li>
+ * <li>Lower casing in the {@link Locale#ENGLISH English locale}</li>
+ * </ol>
+ */
+public class LcNoDiacriticsNormalizer implements org.apache.accumulo.wikisearch.normalizer.Normalizer {
+  // Matches a whole run of combining diacritical marks so each run is removed in one replacement.
+  private static final Pattern diacriticals = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
+
+  /**
+   * Normalizes the string form of {@code fieldValue} by stripping diacritical marks and lower casing it.
+   *
+   * @param fieldName
+   *          name of the field; not used by this normalizer
+   * @param fieldValue
+   *          value to normalize, converted with {@code toString()}; must not be {@code null}
+   * @return the lower-cased value without diacritical marks
+   */
+  @Override
+  public String normalizeFieldValue(String fieldName, Object fieldValue) {
+    // "Normalizer" here is java.text.Normalizer (imported), not the implemented interface.
+    String decomposed = Normalizer.normalize(fieldValue.toString(), Form.NFD);
+    String noDiacriticals = removeDiacriticalMarks(decomposed);
+    String recomposed = Normalizer.normalize(noDiacriticals, Form.NFC);
+    return recomposed.toLowerCase(Locale.ENGLISH);
+  }
+
+  /** Deletes every character of the Combining Diacritical Marks block from {@code str}. */
+  private String removeDiacriticalMarks(String str) {
+    Matcher matcher = diacriticals.matcher(str);
+    return matcher.replaceAll("");
+  }
+
+}
Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/LcNoDiacriticsNormalizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NoOpNormalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NoOpNormalizer.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NoOpNormalizer.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NoOpNormalizer.java Fri Jan 6 22:02:09 2012
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.normalizer;
+
+public class NoOpNormalizer implements Normalizer {
+  /**
+   * Returns the value's string form unchanged; no normalization is applied.
+   *
+   * @param field
+   *          name of the field; not used
+   * @param value
+   *          value to convert; must not be {@code null}
+   * @return {@code value.toString()}
+   */
+  public String normalizeFieldValue(String field, Object value) {
+    final String asText = value.toString();
+    return asText;
+  }
+}
Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NoOpNormalizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/Normalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/Normalizer.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/Normalizer.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/Normalizer.java Fri Jan 6 22:02:09 2012
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.normalizer;
+
+public interface Normalizer {
+
+  /**
+   * Produces the normalized form of a field value for ingest. What "normalized" means is defined by each
+   * implementation.
+   *
+   * @param field
+   *          the name of the field being normalized
+   * @param value
+   *          the value to normalize
+   * @return the normalized value
+   */
+  String normalizeFieldValue(String field, Object value);
+
+}
Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/Normalizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NumberNormalizer.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NumberNormalizer.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NumberNormalizer.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NumberNormalizer.java Fri Jan 6 22:02:09 2012
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.normalizer;
+
+import org.apache.commons.lang.math.NumberUtils;
+import org.apache.lucene.util.NumericUtils;
+
+public class NumberNormalizer implements Normalizer {
+
+  /**
+   * Parses the string form of {@code value} as a number and returns its prefix-coded representation, choosing the
+   * int, long, float or double encoder according to the type the parser produced.
+   *
+   * @param field
+   *          name of the field; not used
+   * @param value
+   *          value whose {@code toString()} is parsed; must not be {@code null}
+   * @return the prefix-coded form of the number
+   * @throws IllegalArgumentException
+   *           if the text is not a number, or parses to a numeric type other than Integer, Long, Float or Double
+   */
+  public String normalizeFieldValue(String field, Object value) {
+    String text = value.toString();
+    if (!NumberUtils.isNumber(text)) {
+      throw new IllegalArgumentException("Value is not a number: " + value);
+    }
+
+    Number parsed = NumberUtils.createNumber(text);
+    if (parsed instanceof Integer) {
+      return NumericUtils.intToPrefixCoded((Integer) parsed);
+    }
+    if (parsed instanceof Long) {
+      return NumericUtils.longToPrefixCoded((Long) parsed);
+    }
+    if (parsed instanceof Float) {
+      return NumericUtils.floatToPrefixCoded((Float) parsed);
+    }
+    if (parsed instanceof Double) {
+      return NumericUtils.doubleToPrefixCoded((Double) parsed);
+    }
+    throw new IllegalArgumentException("Unhandled numeric type: " + parsed.getClass());
+  }
+
+}
Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/normalizer/NumberNormalizer.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/TermWeight.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/TermWeight.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/TermWeight.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/TermWeight.java Fri Jan 6 22:02:09 2012
@@ -0,0 +1,424 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Generated by the protocol buffer compiler. DO NOT EDIT!
+// source: TermWeight.proto
+
+package org.apache.accumulo.wikisearch.protobuf;
+
+public final class TermWeight {
+ private TermWeight() {}
+
+ public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {}
+
+ public static final class Info extends com.google.protobuf.GeneratedMessage {
+ // Use Info.newBuilder() to construct.
+ private Info() {
+ initFields();
+ }
+
+ private Info(boolean noInit) {}
+
+ private static final Info defaultInstance;
+
+ public static Info getDefaultInstance() {
+ return defaultInstance;
+ }
+
+ public Info getDefaultInstanceForType() {
+ return defaultInstance;
+ }
+
+ public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
+ return org.apache.accumulo.wikisearch.protobuf.TermWeight.internal_static_protobuf_Info_descriptor;
+ }
+
+ protected com.google.protobuf.GeneratedMessage.FieldAccessorTable internalGetFieldAccessorTable() {
+ return org.apache.accumulo.wikisearch.protobuf.TermWeight.internal_static_protobuf_Info_fieldAccessorTable;
+ }
+
+ // required float normalizedTermFrequency = 1;
+ public static final int NORMALIZEDTERMFREQUENCY_FIELD_NUMBER = 1;
+ private boolean hasNormalizedTermFrequency;
+ private float normalizedTermFrequency_ = 0F;
+
+ public boolean hasNormalizedTermFrequency() {
+ return hasNormalizedTermFrequency;
+ }
+
+ public float getNormalizedTermFrequency() {
+ return normalizedTermFrequency_;
+ }
+
+ // repeated uint32 wordOffset = 2;
+ public static final int WORDOFFSET_FIELD_NUMBER = 2;
+ private java.util.List<java.lang.Integer> wordOffset_ = java.util.Collections.emptyList();
+
+ public java.util.List<java.lang.Integer> getWordOffsetList() {
+ return wordOffset_;
+ }
+
+ public int getWordOffsetCount() {
+ return wordOffset_.size();
+ }
+
+ public int getWordOffset(int index) {
+ return wordOffset_.get(index);
+ }
+
+ private void initFields() {}
+
+ public final boolean isInitialized() {
+ if (!hasNormalizedTermFrequency)
+ return false;
+ return true;
+ }
+
+ public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException { // writes the set fields, then the unknown fields, to output
+ getSerializedSize(); // result is discarded; the call fills memoizedSerializedSize
+ if (hasNormalizedTermFrequency()) {
+ output.writeFloat(1, getNormalizedTermFrequency()); // field number 1
+ }
+ for (int element : getWordOffsetList()) {
+ output.writeUInt32(2, element); // field number 2, one tagged entry per list element
+ }
+ getUnknownFields().writeTo(output);
+ }
+
+ private int memoizedSerializedSize = -1;
+
+ public int getSerializedSize() { // computes the size once and caches it in memoizedSerializedSize
+ int size = memoizedSerializedSize;
+ if (size != -1) // -1 is the "not yet computed" marker
+ return size;
+
+ size = 0;
+ if (hasNormalizedTermFrequency()) {
+ size += com.google.protobuf.CodedOutputStream.computeFloatSize(1, getNormalizedTermFrequency());
+ }
+ {
+ int dataSize = 0;
+ for (int element : getWordOffsetList()) {
+ dataSize += com.google.protobuf.CodedOutputStream.computeUInt32SizeNoTag(element); // payload only, tag counted below
+ }
+ size += dataSize;
+ size += 1 * getWordOffsetList().size(); // presumably the one-byte tag that precedes each element
+ }
+ size += getUnknownFields().getSerializedSize();
+ memoizedSerializedSize = size;
+ return size;
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data).buildParsed();
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.ByteString data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data).buildParsed();
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(java.io.InputStream input) throws java.io.IOException {
+ return newBuilder().mergeFrom(input).buildParsed();
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseDelimitedFrom(java.io.InputStream input) throws java.io.IOException {
+ Builder builder = newBuilder();
+ if (builder.mergeDelimitedFrom(input)) {
+ return builder.buildParsed();
+ } else {
+ return null;
+ }
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseDelimitedFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ Builder builder = newBuilder();
+ if (builder.mergeDelimitedFrom(input, extensionRegistry)) {
+ return builder.buildParsed();
+ } else {
+ return null;
+ }
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.CodedInputStream input) throws java.io.IOException {
+ return newBuilder().mergeFrom(input).buildParsed();
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.TermWeight.Info parseFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+ }
+
+ public static Builder newBuilder() {
+ return Builder.create();
+ }
+
+ public Builder newBuilderForType() {
+ return newBuilder();
+ }
+
+ public static Builder newBuilder(org.apache.accumulo.wikisearch.protobuf.TermWeight.Info prototype) {
+ return newBuilder().mergeFrom(prototype);
+ }
+
+ public Builder toBuilder() {
+ return newBuilder(this);
+ }
+
+ public static final class Builder extends com.google.protobuf.GeneratedMessage.Builder<Builder> {
+ private org.apache.accumulo.wikisearch.protobuf.TermWeight.Info result;
+
+ // Construct using protobuf.TermWeight.Info.newBuilder()
+ private Builder() {}
+
+ private static Builder create() {
+ Builder builder = new Builder();
+ builder.result = new org.apache.accumulo.wikisearch.protobuf.TermWeight.Info();
+ return builder;
+ }
+
+ protected org.apache.accumulo.wikisearch.protobuf.TermWeight.Info internalGetResult() {
+ return result;
+ }
+
+ public Builder clear() {
+ if (result == null) {
+ throw new IllegalStateException("Cannot call clear() after build().");
+ }
+ result = new org.apache.accumulo.wikisearch.protobuf.TermWeight.Info();
+ return this;
+ }
+
+ public Builder clone() {
+ return create().mergeFrom(result);
+ }
+
+ public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() {
+ return org.apache.accumulo.wikisearch.protobuf.TermWeight.Info.getDescriptor();
+ }
+
+ public org.apache.accumulo.wikisearch.protobuf.TermWeight.Info getDefaultInstanceForType() {
+ return org.apache.accumulo.wikisearch.protobuf.TermWeight.Info.getDefaultInstance();
+ }
+
+ public boolean isInitialized() {
+ return result.isInitialized();
+ }
+
+ public org.apache.accumulo.wikisearch.protobuf.TermWeight.Info build() { // returns the message, rejecting one whose required field is unset
+ if (result != null && !isInitialized()) { // a null result (already built) falls through so buildPartial() reports it
+ throw newUninitializedMessageException(result);
+ }
+ return buildPartial();
+ }
+
+ private org.apache.accumulo.wikisearch.protobuf.TermWeight.Info buildParsed() throws com.google.protobuf.InvalidProtocolBufferException {
+ if (!isInitialized()) {
+ throw newUninitializedMessageException(result).asInvalidProtocolBufferException();
+ }
+ return buildPartial();
+ }
+
+ public org.apache.accumulo.wikisearch.protobuf.TermWeight.Info buildPartial() { // hands out the message without checking required fields
+ if (result == null) {
+ throw new IllegalStateException("build() has already been called on this Builder.");
+ }
+ if (result.wordOffset_ != java.util.Collections.EMPTY_LIST) { // identity check: only a list the builder replaced needs wrapping
+ result.wordOffset_ = java.util.Collections.unmodifiableList(result.wordOffset_);
+ }
+ org.apache.accumulo.wikisearch.protobuf.TermWeight.Info returnMe = result;
+ result = null; // marks this builder as used; a second build fails with the exception above
+ return returnMe;
+ }
+
+ public Builder mergeFrom(com.google.protobuf.Message other) {
+ if (other instanceof org.apache.accumulo.wikisearch.protobuf.TermWeight.Info) {
+ return mergeFrom((org.apache.accumulo.wikisearch.protobuf.TermWeight.Info) other);
+ } else {
+ super.mergeFrom(other);
+ return this;
+ }
+ }
+
+ public Builder mergeFrom(org.apache.accumulo.wikisearch.protobuf.TermWeight.Info other) { // copies the set fields of other into the message being built
+ if (other == org.apache.accumulo.wikisearch.protobuf.TermWeight.Info.getDefaultInstance()) // identity check: the default instance is skipped
+ return this;
+ if (other.hasNormalizedTermFrequency()) {
+ setNormalizedTermFrequency(other.getNormalizedTermFrequency()); // overwrites any value already set
+ }
+ if (!other.wordOffset_.isEmpty()) {
+ if (result.wordOffset_.isEmpty()) {
+ result.wordOffset_ = new java.util.ArrayList<java.lang.Integer>(); // the initial Collections.emptyList() cannot be added to
+ }
+ result.wordOffset_.addAll(other.wordOffset_); // repeated field: elements are appended, not replaced
+ }
+ this.mergeUnknownFields(other.getUnknownFields());
+ return this;
+ }
+
+ public Builder mergeFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry) // reads tagged fields from input until it is exhausted
+ throws java.io.IOException {
+ com.google.protobuf.UnknownFieldSet.Builder unknownFields = com.google.protobuf.UnknownFieldSet.newBuilder(this.getUnknownFields());
+ while (true) {
+ int tag = input.readTag();
+ switch (tag) {
+ case 0: // presumably readTag()'s end-of-input marker; see the CodedInputStream documentation
+ this.setUnknownFields(unknownFields.build());
+ return this;
+ default: { // any tag not listed below is kept as an unknown field
+ if (!parseUnknownField(input, unknownFields, extensionRegistry, tag)) { // false ends parsing
+ this.setUnknownFields(unknownFields.build());
+ return this;
+ }
+ break;
+ }
+ case 13: { // (1 << 3) | 5: field 1, 32-bit wire type
+ setNormalizedTermFrequency(input.readFloat());
+ break;
+ }
+ case 16: { // (2 << 3) | 0: field 2, single varint element
+ addWordOffset(input.readUInt32());
+ break;
+ }
+ case 18: { // (2 << 3) | 2: field 2, length-delimited (packed) run of elements
+ int length = input.readRawVarint32();
+ int limit = input.pushLimit(length); // restrict reads to the packed payload
+ while (input.getBytesUntilLimit() > 0) {
+ addWordOffset(input.readUInt32());
+ }
+ input.popLimit(limit); // restore the previous limit
+ break;
+ }
+ }
+ }
+ }
+
+ // required float normalizedTermFrequency = 1;
+ public boolean hasNormalizedTermFrequency() {
+ return result.hasNormalizedTermFrequency();
+ }
+
+ public float getNormalizedTermFrequency() {
+ return result.getNormalizedTermFrequency();
+ }
+
+ public Builder setNormalizedTermFrequency(float value) {
+ result.hasNormalizedTermFrequency = true;
+ result.normalizedTermFrequency_ = value;
+ return this;
+ }
+
+ public Builder clearNormalizedTermFrequency() {
+ result.hasNormalizedTermFrequency = false;
+ result.normalizedTermFrequency_ = 0F;
+ return this;
+ }
+
+ // repeated uint32 wordOffset = 2;
+ public java.util.List<java.lang.Integer> getWordOffsetList() {
+ return java.util.Collections.unmodifiableList(result.wordOffset_);
+ }
+
+ public int getWordOffsetCount() {
+ return result.getWordOffsetCount();
+ }
+
+ public int getWordOffset(int index) {
+ return result.getWordOffset(index);
+ }
+
+ public Builder setWordOffset(int index, int value) {
+ result.wordOffset_.set(index, value);
+ return this;
+ }
+
+ public Builder addWordOffset(int value) {
+ if (result.wordOffset_.isEmpty()) {
+ result.wordOffset_ = new java.util.ArrayList<java.lang.Integer>();
+ }
+ result.wordOffset_.add(value);
+ return this;
+ }
+
+ public Builder addAllWordOffset(java.lang.Iterable<? extends java.lang.Integer> values) {
+ if (result.wordOffset_.isEmpty()) {
+ result.wordOffset_ = new java.util.ArrayList<java.lang.Integer>();
+ }
+ super.addAll(values, result.wordOffset_);
+ return this;
+ }
+
+ public Builder clearWordOffset() {
+ result.wordOffset_ = java.util.Collections.emptyList();
+ return this;
+ }
+
+ // @@protoc_insertion_point(builder_scope:protobuf.Info)
+ }
+
+ static {
+ defaultInstance = new Info(true);
+ org.apache.accumulo.wikisearch.protobuf.TermWeight.internalForceInit();
+ defaultInstance.initFields();
+ }
+
+ // @@protoc_insertion_point(class_scope:protobuf.Info)
+ }
+
+ private static com.google.protobuf.Descriptors.Descriptor internal_static_protobuf_Info_descriptor;
+ private static com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_protobuf_Info_fieldAccessorTable;
+
+ public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
+ return descriptor;
+ }
+
+ private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
+ static {
+ java.lang.String[] descriptorData = {"\n\020TermWeight.proto\022\010protobuf\";\n\004Info\022\037\n\027"
+ + "normalizedTermFrequency\030\001 \002(\002\022\022\n\nwordOff" + "set\030\002 \003(\rB\014\n\010protobufH\001"};
+ com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() {
+ public com.google.protobuf.ExtensionRegistry assignDescriptors(com.google.protobuf.Descriptors.FileDescriptor root) {
+ descriptor = root;
+ internal_static_protobuf_Info_descriptor = getDescriptor().getMessageTypes().get(0);
+ internal_static_protobuf_Info_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable(
+ internal_static_protobuf_Info_descriptor, new java.lang.String[] {"NormalizedTermFrequency", "WordOffset",}, org.apache.accumulo.wikisearch.protobuf.TermWeight.Info.class,
+ org.apache.accumulo.wikisearch.protobuf.TermWeight.Info.Builder.class);
+ return null;
+ }
+ };
+ com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[] {},
+ assigner);
+ }
+
+ public static void internalForceInit() {}
+
+ // @@protoc_insertion_point(outer_class_scope)
+}
Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/TermWeight.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/Uid.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/Uid.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/Uid.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/Uid.java Fri Jan 6 22:02:09 2012
@@ -0,0 +1,470 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Generated by the protocol buffer compiler. DO NOT EDIT!
+// source: Uid.proto
+
+package org.apache.accumulo.wikisearch.protobuf;
+
+ public final class Uid { // container for the classes generated from Uid.proto
+ private Uid() {} // never instantiated; everything here is static or nested
+
+ public static void registerAllExtensions(com.google.protobuf.ExtensionRegistry registry) {} // empty body: nothing is registered
+
+ public static final class List extends com.google.protobuf.GeneratedMessage { // message with fields IGNORE (1), COUNT (2) and repeated UID (3)
+ // Use List.newBuilder() to construct.
+ private List() {
+ initFields();
+ }
+
+ private List(boolean noInit) {} // skips initFields(); used for defaultInstance in the static block at the end of this class
+
+ private static final List defaultInstance; // assigned in the static block at the end of this class
+
+ public static List getDefaultInstance() {
+ return defaultInstance;
+ }
+
+ public List getDefaultInstanceForType() {
+ return defaultInstance;
+ }
+
+ public static final com.google.protobuf.Descriptors.Descriptor getDescriptor() {
+ return org.apache.accumulo.wikisearch.protobuf.Uid.internal_static_protobuf_List_descriptor;
+ }
+
+ protected com.google.protobuf.GeneratedMessage.FieldAccessorTable internalGetFieldAccessorTable() {
+ return org.apache.accumulo.wikisearch.protobuf.Uid.internal_static_protobuf_List_fieldAccessorTable;
+ }
+
+ // required bool IGNORE = 1;
+ public static final int IGNORE_FIELD_NUMBER = 1;
+ private boolean hasIGNORE; // set to true by Builder.setIGNORE, back to false by Builder.clearIGNORE
+ private boolean iGNORE_ = false;
+
+ public boolean hasIGNORE() {
+ return hasIGNORE;
+ }
+
+ public boolean getIGNORE() {
+ return iGNORE_;
+ }
+
+ // required uint64 COUNT = 2;
+ public static final int COUNT_FIELD_NUMBER = 2;
+ private boolean hasCOUNT; // set to true by Builder.setCOUNT, back to false by Builder.clearCOUNT
+ private long cOUNT_ = 0L;
+
+ public boolean hasCOUNT() {
+ return hasCOUNT;
+ }
+
+ public long getCOUNT() {
+ return cOUNT_;
+ }
+
+ // repeated string UID = 3;
+ public static final int UID_FIELD_NUMBER = 3;
+ private java.util.List<java.lang.String> uID_ = java.util.Collections.emptyList(); // starts as the shared immutable empty list
+
+ public java.util.List<java.lang.String> getUIDList() {
+ return uID_;
+ }
+
+ public int getUIDCount() {
+ return uID_.size();
+ }
+
+ public java.lang.String getUID(int index) {
+ return uID_.get(index);
+ }
+
+ private void initFields() {} // empty body
+
+ public final boolean isInitialized() { // true only when both required fields, IGNORE and COUNT, have been set
+ if (!hasIGNORE)
+ return false;
+ if (!hasCOUNT)
+ return false;
+ return true;
+ }
+
+ public void writeTo(com.google.protobuf.CodedOutputStream output) throws java.io.IOException {
+ getSerializedSize(); // return value unused; the call fills memoizedSerializedSize
+ if (hasIGNORE()) {
+ output.writeBool(1, getIGNORE());
+ }
+ if (hasCOUNT()) {
+ output.writeUInt64(2, getCOUNT());
+ }
+ for (java.lang.String element : getUIDList()) {
+ output.writeString(3, element);
+ }
+ getUnknownFields().writeTo(output);
+ }
+
+ private int memoizedSerializedSize = -1; // -1 means the size has not been computed yet
+
+ public int getSerializedSize() {
+ int size = memoizedSerializedSize;
+ if (size != -1)
+ return size;
+
+ size = 0;
+ if (hasIGNORE()) {
+ size += com.google.protobuf.CodedOutputStream.computeBoolSize(1, getIGNORE());
+ }
+ if (hasCOUNT()) {
+ size += com.google.protobuf.CodedOutputStream.computeUInt64Size(2, getCOUNT());
+ }
+ {
+ int dataSize = 0;
+ for (java.lang.String element : getUIDList()) {
+ dataSize += com.google.protobuf.CodedOutputStream.computeStringSizeNoTag(element);
+ }
+ size += dataSize;
+ size += 1 * getUIDList().size(); // one extra byte per element for the tag that the NoTag size above leaves out
+ }
+ size += getUnknownFields().getSerializedSize();
+ memoizedSerializedSize = size;
+ return size;
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.ByteString data) throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data).buildParsed();
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.ByteString data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(byte[] data) throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data).buildParsed();
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(byte[] data, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws com.google.protobuf.InvalidProtocolBufferException {
+ return newBuilder().mergeFrom(data, extensionRegistry).buildParsed();
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(java.io.InputStream input) throws java.io.IOException {
+ return newBuilder().mergeFrom(input).buildParsed();
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseDelimitedFrom(java.io.InputStream input) throws java.io.IOException {
+ Builder builder = newBuilder();
+ if (builder.mergeDelimitedFrom(input)) {
+ return builder.buildParsed();
+ } else {
+ return null; // mergeDelimitedFrom returned false, so there is no message to return
+ }
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseDelimitedFrom(java.io.InputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ Builder builder = newBuilder();
+ if (builder.mergeDelimitedFrom(input, extensionRegistry)) {
+ return builder.buildParsed();
+ } else {
+ return null; // mergeDelimitedFrom returned false, so there is no message to return
+ }
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.CodedInputStream input) throws java.io.IOException {
+ return newBuilder().mergeFrom(input).buildParsed();
+ }
+
+ public static org.apache.accumulo.wikisearch.protobuf.Uid.List parseFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ return newBuilder().mergeFrom(input, extensionRegistry).buildParsed();
+ }
+
+ public static Builder newBuilder() {
+ return Builder.create();
+ }
+
+ public Builder newBuilderForType() {
+ return newBuilder();
+ }
+
+ public static Builder newBuilder(org.apache.accumulo.wikisearch.protobuf.Uid.List prototype) {
+ return newBuilder().mergeFrom(prototype);
+ }
+
+ public Builder toBuilder() {
+ return newBuilder(this);
+ }
+
+ public static final class Builder extends com.google.protobuf.GeneratedMessage.Builder<Builder> {
+ private org.apache.accumulo.wikisearch.protobuf.Uid.List result; // message being built; buildPartial() sets this to null
+
+ // Construct using protobuf.Uid.List.newBuilder()
+ private Builder() {}
+
+ private static Builder create() {
+ Builder builder = new Builder();
+ builder.result = new org.apache.accumulo.wikisearch.protobuf.Uid.List();
+ return builder;
+ }
+
+ protected org.apache.accumulo.wikisearch.protobuf.Uid.List internalGetResult() {
+ return result;
+ }
+
+ public Builder clear() {
+ if (result == null) {
+ throw new IllegalStateException("Cannot call clear() after build().");
+ }
+ result = new org.apache.accumulo.wikisearch.protobuf.Uid.List();
+ return this;
+ }
+
+ public Builder clone() {
+ return create().mergeFrom(result); // fresh builder with the current field values merged in
+ }
+
+ public com.google.protobuf.Descriptors.Descriptor getDescriptorForType() {
+ return org.apache.accumulo.wikisearch.protobuf.Uid.List.getDescriptor();
+ }
+
+ public org.apache.accumulo.wikisearch.protobuf.Uid.List getDefaultInstanceForType() {
+ return org.apache.accumulo.wikisearch.protobuf.Uid.List.getDefaultInstance();
+ }
+
+ public boolean isInitialized() {
+ return result.isInitialized();
+ }
+
+ public org.apache.accumulo.wikisearch.protobuf.Uid.List build() {
+ if (result != null && !isInitialized()) { // a null result falls through so buildPartial() throws IllegalStateException
+ throw newUninitializedMessageException(result);
+ }
+ return buildPartial();
+ }
+
+ private org.apache.accumulo.wikisearch.protobuf.Uid.List buildParsed() throws com.google.protobuf.InvalidProtocolBufferException { // used by the parseFrom/parseDelimitedFrom methods
+ if (!isInitialized()) {
+ throw newUninitializedMessageException(result).asInvalidProtocolBufferException();
+ }
+ return buildPartial();
+ }
+
+ public org.apache.accumulo.wikisearch.protobuf.Uid.List buildPartial() { // does not check that the required fields are set
+ if (result == null) {
+ throw new IllegalStateException("build() has already been called on this Builder.");
+ }
+ if (result.uID_ != java.util.Collections.EMPTY_LIST) {
+ result.uID_ = java.util.Collections.unmodifiableList(result.uID_); // make the UID list read-only before handing the message out
+ }
+ org.apache.accumulo.wikisearch.protobuf.Uid.List returnMe = result;
+ result = null; // later clear()/buildPartial() calls on this builder will throw
+ return returnMe;
+ }
+
+ public Builder mergeFrom(com.google.protobuf.Message other) {
+ if (other instanceof org.apache.accumulo.wikisearch.protobuf.Uid.List) {
+ return mergeFrom((org.apache.accumulo.wikisearch.protobuf.Uid.List) other);
+ } else {
+ super.mergeFrom(other);
+ return this;
+ }
+ }
+
+ public Builder mergeFrom(org.apache.accumulo.wikisearch.protobuf.Uid.List other) { // scalar fields set in other overwrite ours; UIDs are appended
+ if (other == org.apache.accumulo.wikisearch.protobuf.Uid.List.getDefaultInstance())
+ return this;
+ if (other.hasIGNORE()) {
+ setIGNORE(other.getIGNORE());
+ }
+ if (other.hasCOUNT()) {
+ setCOUNT(other.getCOUNT());
+ }
+ if (!other.uID_.isEmpty()) {
+ if (result.uID_.isEmpty()) {
+ result.uID_ = new java.util.ArrayList<java.lang.String>(); // swap in a mutable list; the default is the immutable empty list
+ }
+ result.uID_.addAll(other.uID_);
+ }
+ this.mergeUnknownFields(other.getUnknownFields());
+ return this;
+ }
+
+ public Builder mergeFrom(com.google.protobuf.CodedInputStream input, com.google.protobuf.ExtensionRegistryLite extensionRegistry)
+ throws java.io.IOException {
+ com.google.protobuf.UnknownFieldSet.Builder unknownFields = com.google.protobuf.UnknownFieldSet.newBuilder(this.getUnknownFields());
+ while (true) {
+ int tag = input.readTag();
+ switch (tag) {
+ case 0: // tag 0 ends parsing
+ this.setUnknownFields(unknownFields.build());
+ return this;
+ default: { // any tag other than 0, 8, 16, 26 is handed to parseUnknownField
+ if (!parseUnknownField(input, unknownFields, extensionRegistry, tag)) {
+ this.setUnknownFields(unknownFields.build());
+ return this;
+ }
+ break;
+ }
+ case 8: { // (1 << 3) | 0: field 1, IGNORE
+ setIGNORE(input.readBool());
+ break;
+ }
+ case 16: { // (2 << 3) | 0: field 2, COUNT
+ setCOUNT(input.readUInt64());
+ break;
+ }
+ case 26: { // (3 << 3) | 2: field 3, one UID element
+ addUID(input.readString());
+ break;
+ }
+ }
+ }
+ }
+
+ // required bool IGNORE = 1;
+ public boolean hasIGNORE() {
+ return result.hasIGNORE();
+ }
+
+ public boolean getIGNORE() {
+ return result.getIGNORE();
+ }
+
+ public Builder setIGNORE(boolean value) {
+ result.hasIGNORE = true;
+ result.iGNORE_ = value;
+ return this;
+ }
+
+ public Builder clearIGNORE() {
+ result.hasIGNORE = false;
+ result.iGNORE_ = false;
+ return this;
+ }
+
+ // required uint64 COUNT = 2;
+ public boolean hasCOUNT() {
+ return result.hasCOUNT();
+ }
+
+ public long getCOUNT() {
+ return result.getCOUNT();
+ }
+
+ public Builder setCOUNT(long value) {
+ result.hasCOUNT = true;
+ result.cOUNT_ = value;
+ return this;
+ }
+
+ public Builder clearCOUNT() {
+ result.hasCOUNT = false;
+ result.cOUNT_ = 0L;
+ return this;
+ }
+
+ // repeated string UID = 3;
+ public java.util.List<java.lang.String> getUIDList() {
+ return java.util.Collections.unmodifiableList(result.uID_); // read-only view; change the list through setUID/addUID/addAllUID/clearUID
+ }
+
+ public int getUIDCount() {
+ return result.getUIDCount();
+ }
+
+ public java.lang.String getUID(int index) {
+ return result.getUID(index);
+ }
+
+ public Builder setUID(int index, java.lang.String value) {
+ if (value == null) {
+ throw new NullPointerException();
+ }
+ result.uID_.set(index, value);
+ return this;
+ }
+
+ public Builder addUID(java.lang.String value) {
+ if (value == null) {
+ throw new NullPointerException();
+ }
+ if (result.uID_.isEmpty()) {
+ result.uID_ = new java.util.ArrayList<java.lang.String>(); // swap in a mutable list; the default is the immutable empty list
+ }
+ result.uID_.add(value);
+ return this;
+ }
+
+ public Builder addAllUID(java.lang.Iterable<? extends java.lang.String> values) {
+ if (result.uID_.isEmpty()) {
+ result.uID_ = new java.util.ArrayList<java.lang.String>(); // swap in a mutable list; the default is the immutable empty list
+ }
+ super.addAll(values, result.uID_);
+ return this;
+ }
+
+ public Builder clearUID() {
+ result.uID_ = java.util.Collections.emptyList();
+ return this;
+ }
+
+ // @@protoc_insertion_point(builder_scope:protobuf.List)
+ }
+
+ static {
+ defaultInstance = new List(true);
+ org.apache.accumulo.wikisearch.protobuf.Uid.internalForceInit(); // calling a static method of Uid makes the JVM run Uid's static initializer first
+ defaultInstance.initFields();
+ }
+
+ // @@protoc_insertion_point(class_scope:protobuf.List)
+ }
+
+ private static com.google.protobuf.Descriptors.Descriptor internal_static_protobuf_List_descriptor;
+ private static com.google.protobuf.GeneratedMessage.FieldAccessorTable internal_static_protobuf_List_fieldAccessorTable;
+
+ public static com.google.protobuf.Descriptors.FileDescriptor getDescriptor() {
+ return descriptor;
+ }
+
+ private static com.google.protobuf.Descriptors.FileDescriptor descriptor;
+ static { // builds the file descriptor from the embedded data and fills in the static descriptor fields above
+ java.lang.String[] descriptorData = {"\n\tUid.proto\022\010protobuf\"2\n\004List\022\016\n\006IGNORE\030"
+ + "\001 \002(\010\022\r\n\005COUNT\030\002 \002(\004\022\013\n\003UID\030\003 \003(\tB\014\n\010pro" + "tobufH\001"};
+ com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner assigner = new com.google.protobuf.Descriptors.FileDescriptor.InternalDescriptorAssigner() {
+ public com.google.protobuf.ExtensionRegistry assignDescriptors(com.google.protobuf.Descriptors.FileDescriptor root) {
+ descriptor = root;
+ internal_static_protobuf_List_descriptor = getDescriptor().getMessageTypes().get(0); // List is the message type at index 0
+ internal_static_protobuf_List_fieldAccessorTable = new com.google.protobuf.GeneratedMessage.FieldAccessorTable(
+ internal_static_protobuf_List_descriptor, new java.lang.String[] {"IGNORE", "COUNT", "UID",}, org.apache.accumulo.wikisearch.protobuf.Uid.List.class,
+ org.apache.accumulo.wikisearch.protobuf.Uid.List.Builder.class);
+ return null; // no ExtensionRegistry is returned
+ }
+ };
+ com.google.protobuf.Descriptors.FileDescriptor.internalBuildGeneratedFileFrom(descriptorData, new com.google.protobuf.Descriptors.FileDescriptor[] {},
+ assigner);
+ }
+
+ public static void internalForceInit() {} // empty body; called from List's static block
+
+ // @@protoc_insertion_point(outer_class_scope)
+}
Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/protobuf/Uid.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/AggregatingRecordReader.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/AggregatingRecordReader.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/AggregatingRecordReader.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/AggregatingRecordReader.java Fri Jan 6 22:02:09 2012
@@ -0,0 +1,170 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.reader;
+
+
+import java.io.IOException;
+
+import org.apache.accumulo.wikisearch.ingest.WikipediaConfiguration;
+import org.apache.accumulo.wikisearch.util.TextUtil;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+
+/**
+ * This class aggregates Text values based on a start and end filter. An example use case for this would be XML data. This will not work with data that has
+ * nested start and stop tokens.
+ *
+ */
+public class AggregatingRecordReader extends LongLineRecordReader {
+
+ public static final String START_TOKEN = "aggregating.token.start";
+ public static final String END_TOKEN = "aggregating.token.end";
+ public static final String RETURN_PARTIAL_MATCHES = "aggregating.allow.partial";
+
+ private LongWritable key = new LongWritable();
+ private String startToken = null;
+ private String endToken = null;
+ private long counter = 0;
+ private Text aggValue = new Text();
+ private boolean startFound = false;
+ private StringBuilder remainder = new StringBuilder(0);
+ private boolean returnPartialMatches = false;
+
+ @Override
+ public LongWritable getCurrentKey() {
+ key.set(counter);
+ return key;
+ }
+
+ @Override
+ public Text getCurrentValue() {
+ return aggValue;
+ }
+
+ @Override
+ public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
+ super.initialize(genericSplit, context);
+ this.startToken = WikipediaConfiguration.isNull(context.getConfiguration(), START_TOKEN, String.class);
+ this.endToken = WikipediaConfiguration.isNull(context.getConfiguration(), END_TOKEN, String.class);
+ this.returnPartialMatches = context.getConfiguration().getBoolean(RETURN_PARTIAL_MATCHES, false);
+
+ /*
+ * Text-appending works almost exactly like the + operator on Strings- it creates a byte array exactly the size of [prefix + suffix] and dumps the bytes
+ * into the new array. This module works by doing lots of little additions, one line at a time. With most XML, the documents are partitioned on line
+ * boundaries, so we will generally have lots of additions. Setting a large default byte array for a text object can avoid this and give us
+ * StringBuilder-like functionality for Text objects.
+ */
+ byte[] txtBuffer = new byte[2048];
+ aggValue.set(txtBuffer);
+ }
+
+ @Override
+ public boolean nextKeyValue() throws IOException {
+ aggValue.clear();
+ boolean hasNext = false;
+ boolean finished = false;
+ // Find the start token
+ while (!finished && (((hasNext = super.nextKeyValue()) == true) || remainder.length() > 0)) {
+ if (hasNext)
+ finished = process(super.getCurrentValue());
+ else
+ finished = process(null);
+ if (finished) {
+ startFound = false;
+ counter++;
+ return true;
+ }
+ }
+ // If we have anything loaded in the agg value (and we found a start)
+ // then we ran out of data before finding the end. Just return the
+ // data we have and if it's not valid, downstream parsing of the data
+ // will fail.
+ if (returnPartialMatches && startFound && aggValue.getLength() > 0) {
+ startFound = false;
+ counter++;
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * Populates aggValue with the contents of the Text object.
+ *
+ * @param t
+ * @return true if aggValue is complete, else false and needs more data.
+ */
+ private boolean process(Text t) {
+
+ if (null != t)
+ remainder.append(t.toString());
+ while (remainder.length() > 0) {
+ if (!startFound) {
+ // If found, then begin aggregating at the start offset
+ int start = remainder.indexOf(startToken);
+ if (-1 != start) {
+ // Append the start token to the aggregate value
+ TextUtil.textAppendNoNull(aggValue, remainder.substring(start, start + startToken.length()), false);
+ // Remove to the end of the start token from the remainder
+ remainder.delete(0, start + startToken.length());
+ startFound = true;
+ } else {
+ // If we are looking for the start and have not found it, then remove
+ // the bytes
+ remainder.delete(0, remainder.length());
+ }
+ } else {
+ // Try to find the end
+ int end = remainder.indexOf(endToken);
+ // Also try to find the start
+ int start = remainder.indexOf(startToken);
+ if (-1 == end) {
+ if (returnPartialMatches && start >= 0) {
+ // End token not found, but another start token was found...
+ // The amount to copy is up to the beginning of the next start token
+ TextUtil.textAppendNoNull(aggValue, remainder.substring(0, start), false);
+ remainder.delete(0, start);
+ return true;
+ } else {
+ // Not found, aggregate the entire remainder
+ TextUtil.textAppendNoNull(aggValue, remainder.toString(), false);
+ // Delete all chars from remainder
+ remainder.delete(0, remainder.length());
+ }
+ } else {
+ if (returnPartialMatches && start >= 0 && start < end) {
+ // We found the end token, but found another start token first, so
+ // deal with that.
+ TextUtil.textAppendNoNull(aggValue, remainder.substring(0, start), false);
+ remainder.delete(0, start);
+ return true;
+ } else {
+ // END_TOKEN was found. Extract to the end of END_TOKEN
+ TextUtil.textAppendNoNull(aggValue, remainder.substring(0, end + endToken.length()), false);
+ // Remove from remainder up to the end of END_TOKEN
+ remainder.delete(0, end + endToken.length());
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+ }
+
+}
Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/AggregatingRecordReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LfLineReader.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LfLineReader.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LfLineReader.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LfLineReader.java Fri Jan 6 22:02:09 2012
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.reader;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+
+/**
+ * A class that provides a line reader from an input stream.
+ */
+public class LfLineReader {
+ private static final int DEFAULT_BUFFER_SIZE = 64 * 1024;
+ private int bufferSize = DEFAULT_BUFFER_SIZE;
+ private InputStream in;
+ private byte[] buffer;
+ // the number of bytes of real data in the buffer
+ private int bufferLength = 0;
+ // the current position in the buffer
+ private int bufferPosn = 0;
+
+ private static final byte LF = '\n';
+
+ /**
+ * Create a line reader that reads from the given stream using the default buffer-size (64k).
+ *
+ * @param in
+ * The input stream
+ * @throws IOException
+ */
+ public LfLineReader(InputStream in) {
+ this(in, DEFAULT_BUFFER_SIZE);
+ }
+
+ /**
+ * Create a line reader that reads from the given stream using the given buffer-size.
+ *
+ * @param in
+ * The input stream
+ * @param bufferSize
+ * Size of the read buffer
+ * @throws IOException
+ */
+ public LfLineReader(InputStream in, int bufferSize) {
+ this.in = in;
+ this.bufferSize = bufferSize;
+ this.buffer = new byte[this.bufferSize];
+ }
+
+ /**
+ * Create a line reader that reads from the given stream using the <code>io.file.buffer.size</code> specified in the given <code>Configuration</code>.
+ *
+ * @param in
+ * input stream
+ * @param conf
+ * configuration
+ * @throws IOException
+ */
+ public LfLineReader(InputStream in, Configuration conf) throws IOException {
+ this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE));
+ }
+
+ /**
+ * Close the underlying stream.
+ *
+ * @throws IOException
+ */
+ public void close() throws IOException {
+ in.close();
+ }
+
+ /**
+ * Read one line from the InputStream into the given Text. A line can be terminated by '\n' (LF). EOF also terminates an otherwise unterminated line.
+ *
+ * @param str
+ * the object to store the given line (without newline)
+ * @param maxLineLength
+ * the maximum number of bytes to store into str; the rest of the line is silently discarded.
+ * @param maxBytesToConsume
+ * the maximum number of bytes to consume in this call. This is only a hint, because if the line cross this threshold, we allow it to happen. It can
+ * overshoot potentially by as much as one buffer length.
+ *
+ * @return the number of bytes read including the (longest) newline found.
+ *
+ * @throws IOException
+ * if the underlying stream throws
+ */
+ public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
+ /*
+ * We're reading data from in, but the head of the stream may be already buffered in buffer, so we have several cases: 1. No newline characters are in the
+ * buffer, so we need to copy everything and read another buffer from the stream. 2. An unambiguously terminated line is in buffer, so we just copy to str.
+ */
+ str.clear();
+ int txtLength = 0; // tracks str.getLength(), as an optimization
+ int newlineLength = 0; // length of terminating newline
+ long bytesConsumed = 0;
+ do {
+ int startPosn = bufferPosn; // starting from where we left off the last time
+ if (bufferPosn >= bufferLength) {
+ startPosn = bufferPosn = 0;
+ bufferLength = in.read(buffer);
+ if (bufferLength <= 0)
+ break; // EOF
+ }
+ for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline
+ if (buffer[bufferPosn] == LF) {
+ newlineLength = 1;
+ ++bufferPosn; // at next invocation proceed from following byte
+ break;
+ }
+ }
+ int readLength = bufferPosn - startPosn;
+ bytesConsumed += readLength;
+ int appendLength = readLength - newlineLength;
+ if (appendLength > maxLineLength - txtLength) {
+ appendLength = maxLineLength - txtLength;
+ }
+ if (appendLength > 0) {
+ str.append(buffer, startPosn, appendLength);
+ txtLength += appendLength;
+ }
+ } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);
+
+ if (bytesConsumed > Integer.MAX_VALUE)
+ throw new IOException("Too many bytes before newline: " + bytesConsumed);
+ return (int) bytesConsumed;
+ }
+
+ /**
+ * Read from the InputStream into the given Text.
+ *
+ * @param str
+ * the object to store the given line
+ * @param maxLineLength
+ * the maximum number of bytes to store into str.
+ * @return the number of bytes read including the newline
+ * @throws IOException
+ * if the underlying stream throws
+ */
+ public int readLine(Text str, int maxLineLength) throws IOException {
+ return readLine(str, maxLineLength, Integer.MAX_VALUE);
+ }
+
+ /**
+ * Read from the InputStream into the given Text.
+ *
+ * @param str
+ * the object to store the given line
+ * @return the number of bytes read including the newline
+ * @throws IOException
+ * if the underlying stream throws
+ */
+ public int readLine(Text str) throws IOException {
+ return readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE);
+ }
+
+}
Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LfLineReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LongLineRecordReader.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LongLineRecordReader.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LongLineRecordReader.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LongLineRecordReader.java Fri Jan 6 22:02:09 2012
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.reader;
+
+import java.io.IOException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.CompressionCodec;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+import org.apache.hadoop.mapreduce.lib.input.FileSplit;
+import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
+import org.apache.hadoop.util.LineReader;
+
+/**
+ * A copy of {@link LineRecordReader} which does not discard lines longer than "mapred.linerecordreader.maxlength". Instead, it returns them, leaving it to the
+ * mapper to decide what to do with it. It also does not treat '\r' (CR) characters as new lines -- it uses {@link LfLineReader} instead of {@link LineReader}
+ * to read lines.
+ */
+public class LongLineRecordReader extends RecordReader<LongWritable,Text> {
+ private CompressionCodecFactory compressionCodecs = null;
+ private long start;
+ private long pos;
+ private long end;
+ private LfLineReader in;
+ private int maxLineLength;
+ private LongWritable key = null;
+ private Text value = null;
+
+ @Override
+ public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
+ FileSplit split = (FileSplit) genericSplit;
+ Configuration job = context.getConfiguration();
+ this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
+ start = split.getStart();
+ end = start + split.getLength();
+ final Path file = split.getPath();
+ compressionCodecs = new CompressionCodecFactory(job);
+ final CompressionCodec codec = compressionCodecs.getCodec(file);
+
+ // open the file and seek to the start of the split
+ FileSystem fs = file.getFileSystem(job);
+ FSDataInputStream fileIn = fs.open(split.getPath());
+ boolean skipFirstLine = false;
+ if (codec != null) {
+ in = new LfLineReader(codec.createInputStream(fileIn), job);
+ end = Long.MAX_VALUE;
+ } else {
+ if (start != 0) {
+ skipFirstLine = true;
+ --start;
+ fileIn.seek(start);
+ }
+ in = new LfLineReader(fileIn, job);
+ }
+ if (skipFirstLine) { // skip first line and re-establish "start".
+ start += in.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
+ }
+ this.pos = start;
+ }
+
+ @Override
+ public boolean nextKeyValue() throws IOException {
+ if (key == null) {
+ key = new LongWritable();
+ }
+ key.set(pos);
+ if (value == null) {
+ value = new Text();
+ }
+ int newSize = 0;
+ if (pos < end) {
+ newSize = in.readLine(value, maxLineLength, Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
+ if (newSize != 0) {
+ pos += newSize;
+ }
+ }
+ if (newSize == 0) {
+ key = null;
+ value = null;
+ return false;
+ } else {
+ return true;
+ }
+ }
+
+ @Override
+ public LongWritable getCurrentKey() {
+ return key;
+ }
+
+ @Override
+ public Text getCurrentValue() {
+ return value;
+ }
+
+ /**
+ * Get the progress within the split
+ */
+ @Override
+ public float getProgress() {
+ if (start == end) {
+ return 0.0f;
+ } else {
+ return Math.min(1.0f, (pos - start) / (float) (end - start));
+ }
+ }
+
+ @Override
+ public synchronized void close() throws IOException {
+ if (in != null) {
+ in.close();
+ }
+ }
+}
Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/reader/LongLineRecordReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/util/TextUtil.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/util/TextUtil.java?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/util/TextUtil.java (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/util/TextUtil.java Fri Jan 6 22:02:09 2012
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.accumulo.wikisearch.util;
+
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import org.apache.hadoop.io.Text;
+import org.apache.accumulo.core.iterators.aggregation.LongSummation;
+
+public class TextUtil {
+
+  private static final byte[] nullByte = {0};
+
+  /**
+   * Appends a null byte followed by the UTF-8 bytes of the given string to the given {@link Text}
+   *
+   * @param text
+   *          the Text to which to append
+   * @param string
+   *          the String to append
+   * @throws IllegalArgumentException
+   *           if the string cannot be encoded
+   */
+  public static void textAppend(Text text, String string) {
+    appendNullByte(text);
+    textAppendNoNull(text, string);
+  }
+
+  /**
+   * Appends a null byte followed by the UTF-8 bytes of the given string to the given {@link Text}
+   *
+   * @param text
+   *          the Text to which to append
+   * @param string
+   *          the String to append
+   * @param replaceBadChar
+   *          passed through to {@link Text#encode(String, boolean)} as its replace flag
+   * @throws IllegalArgumentException
+   *           if the string cannot be encoded
+   */
+  public static void textAppend(Text text, String string, boolean replaceBadChar) {
+    appendNullByte(text);
+    textAppendNoNull(text, string, replaceBadChar);
+  }
+
+  /**
+   * Appends a null byte followed by the first 8 bytes of {@link LongSummation#longToBytes(long)} for the given value to the given {@link Text}
+   *
+   * @param t
+   *          the Text to which to append
+   * @param s
+   *          the long to append
+   */
+  public static void textAppend(Text t, long s) {
+    appendNullByte(t);
+    t.append(LongSummation.longToBytes(s), 0, 8);
+  }
+
+  /**
+   * Appends a null byte to the given text
+   *
+   * @param text
+   *          the text to which to append the null byte
+   */
+  public static void appendNullByte(Text text) {
+    text.append(nullByte, 0, nullByte.length);
+  }
+
+  /**
+   * Appends the UTF-8 bytes of the given string to the given {@link Text}, without replacing characters that cannot be encoded
+   *
+   * @param t
+   *          the Text to which to append
+   * @param s
+   *          the String to append
+   * @throws IllegalArgumentException
+   *           if the string cannot be encoded
+   */
+  public static void textAppendNoNull(Text t, String s) {
+    textAppendNoNull(t, s, false);
+  }
+
+  /**
+   * Appends the UTF-8 bytes of the given string to the given {@link Text}
+   *
+   * @param t
+   *          the Text to which to append
+   * @param s
+   *          the String to append
+   * @param replaceBadChar
+   *          passed through to {@link Text#encode(String, boolean)} as its replace flag
+   * @throws IllegalArgumentException
+   *           wrapping the {@link CharacterCodingException} if encoding fails
+   */
+  public static void textAppendNoNull(Text t, String s, boolean replaceBadChar) {
+    try {
+      ByteBuffer buffer = Text.encode(s, replaceBadChar);
+      // honour the buffer's own offset and position instead of assuming the content starts at index 0 of the backing array
+      t.append(buffer.array(), buffer.arrayOffset() + buffer.position(), buffer.remaining());
+    } catch (CharacterCodingException cce) {
+      throw new IllegalArgumentException(cce);
+    }
+  }
+
+  /**
+   * Converts the given string to its UTF-8 bytes. This uses Hadoop's method for converting string to UTF-8, which the original author describes as much faster
+   * than calling {@link String#getBytes(String)}.
+   *
+   * @param string
+   *          the string to convert
+   * @return the UTF-8 representation of the string
+   * @throws IllegalArgumentException
+   *           wrapping the {@link CharacterCodingException} if encoding fails
+   */
+  public static byte[] toUtf8(String string) {
+    ByteBuffer buffer;
+    try {
+      buffer = Text.encode(string, false);
+    } catch (CharacterCodingException cce) {
+      throw new IllegalArgumentException(cce);
+    }
+    // copy exactly the encoded content, wherever it sits inside the buffer
+    byte[] bytes = new byte[buffer.remaining()];
+    buffer.get(bytes);
+    return bytes;
+  }
+}
Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/java/org/apache/accumulo/wikisearch/util/TextUtil.java
------------------------------------------------------------------------------
svn:eol-style = native
Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/TermWeight.proto
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/TermWeight.proto?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/TermWeight.proto (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/TermWeight.proto Fri Jan 6 22:02:09 2012
@@ -0,0 +1,28 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// compile with protoc --java_out ../java
+// compile extra builder util with java accumulo.data.protobuf.builder.ProtoBufBuilder -d ../java accumulo.data.protobuf.UidList
+// classpath for compile command should include ../../../target/classes and protobuf-java-2.2.0.jar
+
+package protobuf;
+
+option java_package = "protobuf";
+option optimize_for = SPEED;
+
+message Info { // one required float plus a repeated list of uint32 values
+ required float normalizedTermFrequency = 1; // NOTE(review): presumably a term frequency already normalized by the writer -- confirm the normalization used
+ repeated uint32 wordOffset = 2; // zero or more entries; presumably positions of the term within a document -- TODO confirm
+}
Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/Uid.proto
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/Uid.proto?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/Uid.proto (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/Uid.proto Fri Jan 6 22:02:09 2012
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// compile with protoc --java_out ../java
+// compile extra builder util with java accumulo.data.protobuf.builder.ProtoBufBuilder -d ../java accumulo.data.protobuf.UidList
+// classpath for compile command should include ../../../target/classes and protobuf-java-2.2.0.jar
+
+package protobuf;
+
+option java_package = "protobuf";
+option optimize_for = SPEED;
+
+message List { // a flag, a count and a repeated list of UID strings
+ required bool IGNORE = 1; // NOTE(review): presumably tells readers to disregard the UID entries and rely on COUNT -- confirm against the code that reads this message
+ required uint64 COUNT = 2; // NOTE(review): presumably the number of UIDs represented -- confirm
+ repeated string UID = 3; // zero or more UID strings
+}
Added: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/compile_protos.sh
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/compile_protos.sh?rev=1228459&view=auto
==============================================================================
--- incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/compile_protos.sh (added)
+++ incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/compile_protos.sh Fri Jan 6 22:02:09 2012
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+for PROTO in `ls -1 *proto`; do protoc --java_out ../java $PROTO; done
Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/compile_protos.sh
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/accumulo/branches/1.4/src/wikisearch/ingest/src/main/protobuf/compile_protos.sh
------------------------------------------------------------------------------
svn:executable = *