Posted to commits@accumulo.apache.org by ec...@apache.org on 2013/11/26 16:48:04 UTC
[15/39] git commit: ACCUMULO-600 removed wikisearch from trunk
ACCUMULO-600 removed wikisearch from trunk
git-svn-id: https://svn.apache.org/repos/asf/accumulo/trunk@1370489 13f79535-47bb-0310-9956-ffa450edef68
(cherry picked from commit d1e5c6ebe2796154b514ec8f147720d70b3800b5)
Reason: Maintainability
Author: Billie Rinaldi <bi...@apache.org>
Ref: ACCUMULO-1792
Differs from upstream by leaving a tombstone marker pointing to the contrib project.
Author: Sean Busbey <bu...@cloudera.com>
Signed-off-by: Eric Newton <er...@gmail.com>
Project: http://git-wip-us.apache.org/repos/asf/accumulo/repo
Commit: http://git-wip-us.apache.org/repos/asf/accumulo/commit/8db62992
Tree: http://git-wip-us.apache.org/repos/asf/accumulo/tree/8db62992
Diff: http://git-wip-us.apache.org/repos/asf/accumulo/diff/8db62992
Branch: refs/heads/1.6.0-SNAPSHOT
Commit: 8db629923cf4f89b5055f80aa1f39251fd63b25c
Parents: 7fa0085
Author: Sean Busbey <bu...@cloudera.com>
Authored: Tue Oct 22 13:21:55 2013 -0500
Committer: Eric Newton <er...@gmail.com>
Committed: Mon Nov 25 16:06:42 2013 -0500
----------------------------------------------------------------------
.../1GB/native-standalone/generic_logger.xml | 4 -
conf/examples/1GB/standalone/generic_logger.xml | 4 -
.../2GB/native-standalone/generic_logger.xml | 4 -
conf/examples/2GB/standalone/generic_logger.xml | 4 -
.../3GB/native-standalone/generic_logger.xml | 4 -
conf/examples/3GB/standalone/generic_logger.xml | 4 -
.../512MB/native-standalone/generic_logger.xml | 4 -
.../512MB/standalone/generic_logger.xml | 4 -
src/examples/pom.xml | 1 -
src/examples/wikisearch/README | 69 +-
src/examples/wikisearch/README.parallel | 65 -
src/examples/wikisearch/ingest/bin/ingest.sh | 46 -
.../wikisearch/ingest/bin/ingest_parallel.sh | 46 -
.../ingest/conf/wikipedia.xml.example | 43 -
.../ingest/conf/wikipedia_parallel.xml.example | 75 -
src/examples/wikisearch/ingest/pom.xml | 160 --
.../wikisearch/ingest/src/assembly/dist.xml | 38 -
.../wikisearch/ingest/ArticleExtractor.java | 207 --
.../wikisearch/ingest/LRUOutputCombiner.java | 75 -
.../ingest/WikipediaConfiguration.java | 198 --
.../wikisearch/ingest/WikipediaIngester.java | 206 --
.../wikisearch/ingest/WikipediaInputFormat.java | 136 --
.../wikisearch/ingest/WikipediaMapper.java | 245 ---
.../ingest/WikipediaPartitionedIngester.java | 310 ---
.../ingest/WikipediaPartitionedMapper.java | 310 ---
.../wikisearch/ingest/WikipediaPartitioner.java | 89 -
.../iterator/GlobalIndexUidCombiner.java | 94 -
.../wikisearch/iterator/TextIndexCombiner.java | 102 -
.../normalizer/LcNoDiacriticsNormalizer.java | 49 -
.../wikisearch/normalizer/NoOpNormalizer.java | 23 -
.../wikisearch/normalizer/Normalizer.java | 32 -
.../wikisearch/normalizer/NumberNormalizer.java | 42 -
.../output/BufferingRFileRecordWriter.java | 140 --
.../output/SortingRFileOutputFormat.java | 121 --
.../wikisearch/protobuf/TermWeight.java | 424 ----
.../examples/wikisearch/protobuf/Uid.java | 470 -----
.../reader/AggregatingRecordReader.java | 171 --
.../wikisearch/reader/LfLineReader.java | 173 --
.../wikisearch/reader/LongLineRecordReader.java | 136 --
.../examples/wikisearch/util/TextUtil.java | 109 -
.../ingest/src/main/protobuf/TermWeight.proto | 28 -
.../ingest/src/main/protobuf/Uid.proto | 29 -
.../ingest/src/main/protobuf/compile_protos.sh | 19 -
.../ingest/StandaloneStatusReporter.java | 70 -
.../ingest/WikipediaInputSplitTest.java | 69 -
.../wikisearch/ingest/WikipediaMapperTest.java | 163 --
.../wikisearch/iterator/GlobalIndexUidTest.java | 192 --
.../wikisearch/iterator/TextIndexTest.java | 185 --
.../normalizer/testNumberNormalizer.java | 90 -
.../reader/AggregatingRecordReaderTest.java | 287 ---
.../src/test/resources/enwiki-20110901-001.xml | 153 --
src/examples/wikisearch/pom.xml | 253 ---
src/examples/wikisearch/query-war/pom.xml | 66 -
.../src/main/webapp/WEB-INF/jboss-web.xml | 20 -
.../query-war/src/main/webapp/WEB-INF/web.xml | 57 -
.../query-war/src/main/webapp/style.xsl | 47 -
.../wikisearch/query-war/src/main/webapp/ui.jsp | 131 --
.../query-war/src/test/resources/test.xml | 1651 ---------------
src/examples/wikisearch/query/pom.xml | 180 --
.../wikisearch/query/src/assembly/dist.xml | 40 -
.../wikisearch/function/QueryFunctions.java | 68 -
.../iterator/AbstractEvaluatingIterator.java | 323 ---
.../wikisearch/iterator/AndIterator.java | 921 ---------
.../iterator/BooleanLogicIterator.java | 1949 ------------------
.../iterator/BooleanLogicTreeNode.java | 523 -----
.../iterator/DefaultIteratorEnvironment.java | 58 -
.../wikisearch/iterator/EvaluatingIterator.java | 115 --
.../wikisearch/iterator/FieldIndexIterator.java | 736 -------
.../iterator/OptimizedQueryIterator.java | 205 --
.../wikisearch/iterator/OrIterator.java | 822 --------
.../wikisearch/iterator/ReadAheadIterator.java | 297 ---
.../iterator/UniqFieldNameValueIterator.java | 342 ---
.../examples/wikisearch/jexl/Arithmetic.java | 126 --
.../wikisearch/logic/AbstractQueryLogic.java | 883 --------
.../examples/wikisearch/logic/ContentLogic.java | 109 -
.../examples/wikisearch/logic/QueryLogic.java | 195 --
.../examples/wikisearch/parser/EventFields.java | 227 --
.../parser/FieldIndexQueryReWriter.java | 1139 ----------
.../parser/JexlOperatorConstants.java | 105 -
.../wikisearch/parser/QueryEvaluator.java | 291 ---
.../examples/wikisearch/parser/QueryParser.java | 845 --------
.../wikisearch/parser/RangeCalculator.java | 1199 -----------
.../examples/wikisearch/parser/TreeBuilder.java | 675 ------
.../examples/wikisearch/parser/TreeNode.java | 235 ---
.../examples/wikisearch/query/IQuery.java | 66 -
.../examples/wikisearch/query/Query.java | 239 ---
.../examples/wikisearch/sample/Document.java | 61 -
.../examples/wikisearch/sample/Field.java | 58 -
.../examples/wikisearch/sample/Results.java | 53 -
.../examples/wikisearch/util/BaseKeyParser.java | 77 -
.../wikisearch/util/FieldIndexKeyParser.java | 71 -
.../examples/wikisearch/util/KeyParser.java | 70 -
.../src/main/resources/META-INF/MANIFEST.MF | 2 -
.../main/resources/META-INF/ejb-jar.xml.example | 62 -
.../logic/StandaloneStatusReporter.java | 70 -
.../wikisearch/logic/TestQueryLogic.java | 186 --
.../src/test/resources/enwiki-20110901-001.xml | 153 --
97 files changed, 1 insertion(+), 21722 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/1GB/native-standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/1GB/native-standalone/generic_logger.xml b/conf/examples/1GB/native-standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/1GB/native-standalone/generic_logger.xml
+++ b/conf/examples/1GB/native-standalone/generic_logger.xml
@@ -69,10 +69,6 @@
<level value="INFO"/>
</logger>
- <logger name="org.apache.accumulo.examples.wikisearch">
- <level value="INFO"/>
- </logger>
-
<logger name="org.mortbay.log">
<level value="WARN"/>
</logger>
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/1GB/standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/1GB/standalone/generic_logger.xml b/conf/examples/1GB/standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/1GB/standalone/generic_logger.xml
+++ b/conf/examples/1GB/standalone/generic_logger.xml
@@ -69,10 +69,6 @@
<level value="INFO"/>
</logger>
- <logger name="org.apache.accumulo.examples.wikisearch">
- <level value="INFO"/>
- </logger>
-
<logger name="org.mortbay.log">
<level value="WARN"/>
</logger>
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/2GB/native-standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/2GB/native-standalone/generic_logger.xml b/conf/examples/2GB/native-standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/2GB/native-standalone/generic_logger.xml
+++ b/conf/examples/2GB/native-standalone/generic_logger.xml
@@ -69,10 +69,6 @@
<level value="INFO"/>
</logger>
- <logger name="org.apache.accumulo.examples.wikisearch">
- <level value="INFO"/>
- </logger>
-
<logger name="org.mortbay.log">
<level value="WARN"/>
</logger>
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/2GB/standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/2GB/standalone/generic_logger.xml b/conf/examples/2GB/standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/2GB/standalone/generic_logger.xml
+++ b/conf/examples/2GB/standalone/generic_logger.xml
@@ -69,10 +69,6 @@
<level value="INFO"/>
</logger>
- <logger name="org.apache.accumulo.examples.wikisearch">
- <level value="INFO"/>
- </logger>
-
<logger name="org.mortbay.log">
<level value="WARN"/>
</logger>
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/3GB/native-standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/3GB/native-standalone/generic_logger.xml b/conf/examples/3GB/native-standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/3GB/native-standalone/generic_logger.xml
+++ b/conf/examples/3GB/native-standalone/generic_logger.xml
@@ -69,10 +69,6 @@
<level value="INFO"/>
</logger>
- <logger name="org.apache.accumulo.examples.wikisearch">
- <level value="INFO"/>
- </logger>
-
<logger name="org.mortbay.log">
<level value="WARN"/>
</logger>
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/3GB/standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/3GB/standalone/generic_logger.xml b/conf/examples/3GB/standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/3GB/standalone/generic_logger.xml
+++ b/conf/examples/3GB/standalone/generic_logger.xml
@@ -69,10 +69,6 @@
<level value="INFO"/>
</logger>
- <logger name="org.apache.accumulo.examples.wikisearch">
- <level value="INFO"/>
- </logger>
-
<logger name="org.mortbay.log">
<level value="WARN"/>
</logger>
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/512MB/native-standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/512MB/native-standalone/generic_logger.xml b/conf/examples/512MB/native-standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/512MB/native-standalone/generic_logger.xml
+++ b/conf/examples/512MB/native-standalone/generic_logger.xml
@@ -69,10 +69,6 @@
<level value="INFO"/>
</logger>
- <logger name="org.apache.accumulo.examples.wikisearch">
- <level value="INFO"/>
- </logger>
-
<logger name="org.mortbay.log">
<level value="WARN"/>
</logger>
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/conf/examples/512MB/standalone/generic_logger.xml
----------------------------------------------------------------------
diff --git a/conf/examples/512MB/standalone/generic_logger.xml b/conf/examples/512MB/standalone/generic_logger.xml
index dc45710..5dc38ac 100644
--- a/conf/examples/512MB/standalone/generic_logger.xml
+++ b/conf/examples/512MB/standalone/generic_logger.xml
@@ -69,10 +69,6 @@
<level value="INFO"/>
</logger>
- <logger name="org.apache.accumulo.examples.wikisearch">
- <level value="INFO"/>
- </logger>
-
<logger name="org.mortbay.log">
<level value="WARN"/>
</logger>
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/pom.xml
----------------------------------------------------------------------
diff --git a/src/examples/pom.xml b/src/examples/pom.xml
index 2d56be6..0ec2184 100644
--- a/src/examples/pom.xml
+++ b/src/examples/pom.xml
@@ -29,7 +29,6 @@
<modules>
<module>simple</module>
- <module>wikisearch</module>
</modules>
<repositories>
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/README
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/README b/src/examples/wikisearch/README
index 041490f..806de3c 100644
--- a/src/examples/wikisearch/README
+++ b/src/examples/wikisearch/README
@@ -1,68 +1 @@
- Apache Accumulo Wikipedia Search Example
-
- This project contains a sample application for ingesting and querying wikipedia data.
-
-
- Ingest
- ------
-
- Prerequisites
- -------------
- 1. Accumulo, Hadoop, and ZooKeeper must be installed and running
- 2. One or more wikipedia dump files (http://dumps.wikimedia.org/backup-index.html) placed in an HDFS directory.
- You will want to grab the files with the link name of pages-articles.xml.bz2
- 3. Though not strictly required, the ingest will go more quickly if the files are decompressed:
-
- $ bunzip2 < enwiki-*-pages-articles.xml.bz2 | hadoop fs -put - /wikipedia/enwiki-pages-articles.xml
-
-
- INSTRUCTIONS
- ------------
- 1. Copy the ingest/conf/wikipedia.xml.example to ingest/conf/wikipedia.xml and change it to specify Accumulo information.
- 2. Copy the ingest/lib/wikisearch-*.jar and ingest/lib/protobuf*.jar to $ACCUMULO_HOME/lib/ext
- 3. Then run ingest/bin/ingest.sh with one argument (the name of the directory in HDFS where the wikipedia XML
- files reside) and this will kick off a MapReduce job to ingest the data into Accumulo.
-
- Query
- -----
-
- Prerequisites
- -------------
- 1. The query software was tested using JBoss AS 6. Install this unless you feel like messing with the installation.
-
- NOTE: Ran into a bug (https://issues.jboss.org/browse/RESTEASY-531) that did not allow an EJB3.1 war file. The
- workaround is to separate the RESTEasy servlet from the EJBs by creating an EJB jar and a WAR file.
-
- INSTRUCTIONS
- -------------
- 1. Copy the query/src/main/resources/META-INF/ejb-jar.xml.example file to
- query/src/main/resources/META-INF/ejb-jar.xml. Modify to the file to contain the same
- information that you put into the wikipedia.xml file from the Ingest step above.
- 2. Re-build the query distribution by running 'mvn package assembly:single' in the top-level directory.
- 3. Untar the resulting file in the $JBOSS_HOME/server/default directory.
-
- $ cd $JBOSS_HOME/server/default
- $ tar -xzf $ACCUMULO_HOME/src/examples/wikisearch/query/target/wikisearch-query*.tar.gz
-
- This will place the dependent jars in the lib directory and the EJB jar into the deploy directory.
- 4. Next, copy the wikisearch*.war file in the query-war/target directory to $JBOSS_HOME/server/default/deploy.
- 5. Start JBoss ($JBOSS_HOME/bin/run.sh)
- 6. Use the Accumulo shell and give the user permissions for the wikis that you loaded, for example:
- setauths -u <user> -s all,enwiki,eswiki,frwiki,fawiki
- 7. Copy the following jars to the $ACCUMULO_HOME/lib/ext directory from the $JBOSS_HOME/server/default/lib directory:
-
- commons-lang*.jar
- kryo*.jar
- minlog*.jar
- commons-jexl*.jar
- google-collections*.jar
-
- 8. Copy the $JBOSS_HOME/server/default/deploy/wikisearch-query*.jar to $ACCUMULO_HOME/lib/ext.
-
-
- 9. At this point you should be able to open a browser and view the page: http://localhost:8080/accumulo-wikisearch/ui/ui.jsp.
- You can issue the queries using this user interface or via the following REST urls: <host>/accumulo-wikisearch/rest/Query/xml,
- <host>/accumulo-wikisearch/rest/Query/html, <host>/accumulo-wikisearch/rest/Query/yaml, or <host>/accumulo-wikisearch/rest/Query/json.
- There are two parameters to the REST service, query and auths. The query parameter is the same string that you would type
- into the search box at ui.jsp, and the auths parameter is a comma-separated list of wikis that you want to search (i.e.
- enwiki,frwiki,dewiki, etc. Or you can use all)
+The Accumulo Wikipedia Search Example has moved to [a contrib project](http://accumulo.apache.org/contrib.html). For more information, see [ACCUMULO-600](https://issues.apache.org/jira/browse/ACCUMULO-600) and the [wikisearch contrib repository](https://git-wip-us.apache.org/repos/asf?p=accumulo-wikisearch.git;a=summary).
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/README.parallel
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/README.parallel b/src/examples/wikisearch/README.parallel
deleted file mode 100644
index 477556b..0000000
--- a/src/examples/wikisearch/README.parallel
+++ /dev/null
@@ -1,65 +0,0 @@
- Apache Accumulo Wikipedia Search Example (parallel version)
-
- This project contains a sample application for ingesting and querying wikipedia data.
-
-
- Ingest
- ------
-
- Prerequisites
- -------------
- 1. Accumulo, Hadoop, and ZooKeeper must be installed and running
- 2. One or more wikipedia dump files (http://dumps.wikimedia.org/backup-index.html) placed in an HDFS directory.
- You will want to grab the files with the link name of pages-articles.xml.bz2
-
-
- INSTRUCTIONS
- ------------
- 1. Copy the ingest/conf/wikipedia_parallel.xml.example to ingest/conf/wikipedia.xml and change it to specify Accumulo information.
- 2. Copy the ingest/lib/wikisearch-*.jar and ingest/lib/protobuf*.jar to $ACCUMULO_HOME/lib/ext
- 3. Then run ingest/bin/ingest_parallel.sh with one argument (the name of the directory in HDFS where the wikipedia XML
- files reside) and this will kick off a MapReduce job to ingest the data into Accumulo.
-
- Query
- -----
-
- Prerequisites
- -------------
- 1. The query software was tested using JBoss AS 6. Install this unless you feel like messing with the installation.
-
- NOTE: Ran into a bug (https://issues.jboss.org/browse/RESTEASY-531) that did not allow an EJB3.1 war file. The
- workaround is to separate the RESTEasy servlet from the EJBs by creating an EJB jar and a WAR file.
-
- INSTRUCTIONS
- -------------
- 1. Copy the query/src/main/resources/META-INF/ejb-jar.xml.example file to
- query/src/main/resources/META-INF/ejb-jar.xml. Modify to the file to contain the same
- information that you put into the wikipedia.xml file from the Ingest step above.
- 2. Re-build the query distribution by running 'mvn package assembly:single' in the top-level directory.
- 3. Untar the resulting file in the $JBOSS_HOME/server/default directory.
-
- $ cd $JBOSS_HOME/server/default
- $ tar -xzf $ACCUMULO_HOME/src/examples/wikisearch/query/target/wikisearch-query*.tar.gz
-
- This will place the dependent jars in the lib directory and the EJB jar into the deploy directory.
- 4. Next, copy the wikisearch*.war file in the query-war/target directory to $JBOSS_HOME/server/default/deploy.
- 5. Start JBoss ($JBOSS_HOME/bin/run.sh)
- 6. Use the Accumulo shell and give the user permissions for the wikis that you loaded, for example:
- setauths -u <user> -s all,enwiki,eswiki,frwiki,fawiki
- 7. Copy the following jars to the $ACCUMULO_HOME/lib/ext directory from the $JBOSS_HOME/server/default/lib directory:
-
- commons-lang*.jar
- kryo*.jar
- minlog*.jar
- commons-jexl*.jar
- google-collections*.jar
-
- 8. Copy the $JBOSS_HOME/server/default/deploy/wikisearch-query*.jar to $ACCUMULO_HOME/lib/ext.
-
-
- 9. At this point you should be able to open a browser and view the page: http://localhost:8080/accumulo-wikisearch/ui/ui.jsp.
- You can issue the queries using this user interface or via the following REST urls: <host>/accumulo-wikisearch/rest/Query/xml,
- <host>/accumulo-wikisearch/rest/Query/html, <host>/accumulo-wikisearch/rest/Query/yaml, or <host>/accumulo-wikisearch/rest/Query/json.
- There are two parameters to the REST service, query and auths. The query parameter is the same string that you would type
- into the search box at ui.jsp, and the auths parameter is a comma-separated list of wikis that you want to search (i.e.
- enwiki,frwiki,dewiki, etc. Or you can use all)
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/bin/ingest.sh
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/bin/ingest.sh b/src/examples/wikisearch/ingest/bin/ingest.sh
deleted file mode 100755
index acdcbf8..0000000
--- a/src/examples/wikisearch/ingest/bin/ingest.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-
-THIS_SCRIPT="$0"
-SCRIPT_DIR="${THIS_SCRIPT%/*}"
-SCRIPT_DIR=`cd $SCRIPT_DIR ; pwd`
-echo $SCRIPT_DIR
-
-#
-# Add our jars
-#
-for f in $SCRIPT_DIR/../lib/*.jar; do
- CLASSPATH=${CLASSPATH}:$f
-done
-
-#
-# Transform the classpath into a comma-separated list also
-#
-LIBJARS=`echo $CLASSPATH | sed 's/^://' | sed 's/:/,/g'`
-
-
-#
-# Map/Reduce job
-#
-JAR=$SCRIPT_DIR/../lib/wikisearch-ingest-1.4.5-SNAPSHOT.jar
-CONF=$SCRIPT_DIR/../conf/wikipedia.xml
-HDFS_DATA_DIR=$1
-export HADOOP_CLASSPATH=$CLASSPATH
-echo "hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR}"
-hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR}
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/bin/ingest_parallel.sh
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/bin/ingest_parallel.sh b/src/examples/wikisearch/ingest/bin/ingest_parallel.sh
deleted file mode 100755
index 8c63ac0..0000000
--- a/src/examples/wikisearch/ingest/bin/ingest_parallel.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-
-THIS_SCRIPT="$0"
-SCRIPT_DIR="${THIS_SCRIPT%/*}"
-SCRIPT_DIR=`cd $SCRIPT_DIR ; pwd`
-echo $SCRIPT_DIR
-
-#
-# Add our jars
-#
-for f in $SCRIPT_DIR/../lib/*.jar; do
- CLASSPATH=${CLASSPATH}:$f
-done
-
-#
-# Transform the classpath into a comma-separated list also
-#
-LIBJARS=`echo $CLASSPATH | sed 's/^://' | sed 's/:/,/g'`
-
-
-#
-# Map/Reduce job
-#
-JAR=$SCRIPT_DIR/../lib/wikisearch-ingest-1.4.5-SNAPSHOT.jar
-CONF=$SCRIPT_DIR/../conf/wikipedia.xml
-HDFS_DATA_DIR=$1
-export HADOOP_CLASSPATH=$CLASSPATH
-echo "hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaPartitionedIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR}"
-hadoop jar $JAR org.apache.accumulo.examples.wikisearch.ingest.WikipediaPartitionedIngester -libjars $LIBJARS -conf $CONF -Dwikipedia.input=${HDFS_DATA_DIR}
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/conf/wikipedia.xml.example
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/conf/wikipedia.xml.example b/src/examples/wikisearch/ingest/conf/wikipedia.xml.example
deleted file mode 100644
index b08742e..0000000
--- a/src/examples/wikisearch/ingest/conf/wikipedia.xml.example
+++ /dev/null
@@ -1,43 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<configuration>
- <property>
- <name>wikipedia.accumulo.zookeepers</name>
- <value><!--zookeeper servers --></value>
- </property>
- <property>
- <name>wikipedia.accumulo.instance_name</name>
- <value><!--instance name --></value>
- </property>
- <property>
- <name>wikipedia.accumulo.user</name>
- <value><!--user name --></value>
- </property>
- <property>
- <name>wikipedia.accumulo.password</name>
- <value><!-- password --></value>
- </property>
- <property>
- <name>wikipedia.accumulo.table</name>
- <value><!--table name --></value>
- </property>
- <property>
- <name>wikipedia.ingest.partitions</name>
- <value><!--number of partitions --></value>
- </property>
-</configuration>
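The file above is an ordinary Hadoop Configuration resource. As a rough sketch of how the ingest code could load a filled-in copy and read the wikipedia.* keys (the resource path and printed values are placeholders, not taken from this commit; the default of 25 partitions matches WikipediaConfiguration further down):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;

    public class LoadWikipediaConf {
      public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Load a filled-in copy of wikipedia.xml.example (placeholder path).
        conf.addResource(new Path("ingest/conf/wikipedia.xml"));
        String zookeepers = conf.get("wikipedia.accumulo.zookeepers");
        String table = conf.get("wikipedia.accumulo.table");
        int partitions = conf.getInt("wikipedia.ingest.partitions", 25);
        System.out.println(table + " on " + zookeepers + " with " + partitions + " partitions");
      }
    }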
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/conf/wikipedia_parallel.xml.example
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/conf/wikipedia_parallel.xml.example b/src/examples/wikisearch/ingest/conf/wikipedia_parallel.xml.example
deleted file mode 100644
index 53220f0..0000000
--- a/src/examples/wikisearch/ingest/conf/wikipedia_parallel.xml.example
+++ /dev/null
@@ -1,75 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<configuration>
- <property>
- <name>wikipedia.accumulo.zookeepers</name>
- <value><!--zookeeper servers --></value>
- </property>
- <property>
- <name>wikipedia.accumulo.instance_name</name>
- <value><!--instance name --></value>
- </property>
- <property>
- <name>wikipedia.accumulo.user</name>
- <value><!--user name --></value>
- </property>
- <property>
- <name>wikipedia.accumulo.password</name>
- <value><!-- password --></value>
- </property>
- <property>
- <name>wikipedia.accumulo.table</name>
- <value><!--table name --></value>
- </property>
- <property>
- <name>wikipedia.ingest.partitions</name>
- <value><!--number of partitions --></value>
- </property>
- <property>
- <name>wikipedia.partitioned.directory</name>
- <value><!--hdfs directory for intemediate partitioned storage --></value>
- </property>
- <property>
- <name>wikipedia.ingest.groups</name>
- <value><!--the number of intermediate partition groups to generate --></value>
- </property>
- <property>
- <name>wikipedia.run.partitioner</name>
- <value><!--whether to run the partitioner step --></value>
- </property>
- <property>
- <name>wikipedia.run.ingest</name>
- <value><!--whether to run the ingest step --></value>
- </property>
- <property>
- <name>wikipedia.bulk.ingest</name>
- <value><!--whether to use bulk ingest vice streaming ingest --></value>
- </property>
- <property>
- <name>wikipedia.bulk.ingest.dir</name>
- <value><!--the directory to store rfiles for bulk ingest --></value>
- </property>
- <property>
- <name>wikipedia.bulk.ingest.failure.dir</name>
- <value><!--the directory to store failed rfiles after bulk ingest --></value>
- </property>
- <property>
- <name>wikipedia.bulk.ingest.buffer.size</name>
- <value><!--the ammount of memory to use for buffering and sorting key/value pairs in each mapper before writing rfiles --></value>
- </property>
-</configuration>
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/pom.xml
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/pom.xml b/src/examples/wikisearch/ingest/pom.xml
deleted file mode 100644
index 31d7110..0000000
--- a/src/examples/wikisearch/ingest/pom.xml
+++ /dev/null
@@ -1,160 +0,0 @@
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- -->
-
- <modelVersion>4.0.0</modelVersion>
- <parent>
- <artifactId>accumulo-wikisearch</artifactId>
- <groupId>org.apache.accumulo</groupId>
- <version>1.4.5-SNAPSHOT</version>
- <relativePath>../</relativePath>
- </parent>
-
- <artifactId>wikisearch-ingest</artifactId>
- <name>wikisearch-ingest</name>
-
- <dependencies>
- <dependency>
- <groupId>org.apache.zookeeper</groupId>
- <artifactId>zookeeper</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.accumulo</groupId>
- <artifactId>accumulo-core</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.accumulo</groupId>
- <artifactId>accumulo-start</artifactId>
- </dependency>
- <dependency>
- <groupId>log4j</groupId>
- <artifactId>log4j</artifactId>
- </dependency>
- <dependency>
- <groupId>commons-lang</groupId>
- <artifactId>commons-lang</artifactId>
- </dependency>
- <dependency>
- <groupId>com.google.collections</groupId>
- <artifactId>google-collections</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-core</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-wikipedia</artifactId>
- </dependency>
- <dependency>
- <groupId>com.google.protobuf</groupId>
- <artifactId>protobuf-java</artifactId>
- </dependency>
- <dependency>
- <groupId>com.sun.jersey</groupId>
- <artifactId>jersey-server</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.accumulo</groupId>
- <artifactId>cloudtrace</artifactId>
- <scope>runtime</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.thrift</groupId>
- <artifactId>libthrift</artifactId>
- <scope>runtime</scope>
- </dependency>
- <dependency>
- <groupId>commons-codec</groupId>
- <artifactId>commons-codec</artifactId>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- <executions>
- <execution>
- <id>copy-dependencies</id>
- <phase>process-resources</phase>
- <goals>
- <goal>copy-dependencies</goal>
- </goals>
- <configuration>
- <outputDirectory>lib</outputDirectory>
- <!-- just grab the non-provided runtime dependencies -->
- <includeArtifactIds>commons-lang,google-collections,lucene-core,lucene-analyzers,lucene-wikipedia,protobuf-java,accumulo-core,hadoop-core,libthrift,cloudtrace,zookeeper,commons-codec</includeArtifactIds>
- <excludeTransitive>false</excludeTransitive>
- </configuration>
- </execution>
- </executions>
- </plugin>
- <plugin>
- <artifactId>maven-assembly-plugin</artifactId>
- <configuration>
- <descriptors>
- <descriptor>src/assembly/dist.xml</descriptor>
- </descriptors>
- <tarLongFileMode>gnu</tarLongFileMode>
- </configuration>
- </plugin>
- </plugins>
- </build>
-
- <profiles>
- <!-- profile for building against Hadoop 1.0.x
- Activate by not specifying hadoop.profile -->
- <profile>
- <id>hadoop-1.0</id>
- <activation>
- <property>
- <name>!hadoop.profile</name>
- </property>
- </activation>
- <dependencies>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-core</artifactId>
- </dependency>
- </dependencies>
- </profile>
- <!-- profile for building against Hadoop 2.0.x
- Activate using: mvn -Dhadoop.profile=2.0 -->
- <profile>
- <id>hadoop-2.0</id>
- <activation>
- <property>
- <name>hadoop.profile</name>
- <value>2.0</value>
- </property>
- </activation>
- <dependencies>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-client</artifactId>
- </dependency>
- </dependencies>
- </profile>
- </profiles>
-
-</project>
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/assembly/dist.xml
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/assembly/dist.xml b/src/examples/wikisearch/ingest/src/assembly/dist.xml
deleted file mode 100644
index e3e59c1..0000000
--- a/src/examples/wikisearch/ingest/src/assembly/dist.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<assembly>
- <id>dist</id>
- <formats>
- <format>tar.gz</format>
- </formats>
- <baseDirectory></baseDirectory>
- <fileSets>
- <fileSet>
- <directory>lib</directory>
- <fileMode>0644</fileMode>
- </fileSet>
- <fileSet>
- <directory>bin</directory>
- <fileMode>0744</fileMode>
- </fileSet>
- <fileSet>
- <directory>conf</directory>
- <fileMode>0644</fileMode>
- </fileSet>
- </fileSets>
-</assembly>
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java
deleted file mode 100644
index 0699cfa..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/ArticleExtractor.java
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.io.Reader;
-import java.text.ParseException;
-import java.text.SimpleDateFormat;
-import java.util.HashMap;
-import java.util.Map;
-
-import javax.xml.namespace.QName;
-import javax.xml.stream.XMLInputFactory;
-import javax.xml.stream.XMLStreamException;
-import javax.xml.stream.XMLStreamReader;
-
-import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;
-import org.apache.accumulo.examples.wikisearch.normalizer.NumberNormalizer;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-
-
-public class ArticleExtractor {
-
- public final static SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'Z");
- private static NumberNormalizer nn = new NumberNormalizer();
- private static LcNoDiacriticsNormalizer lcdn = new LcNoDiacriticsNormalizer();
-
- public static class Article implements Writable {
- int id;
- String title;
- long timestamp;
- String comments;
- String text;
-
- public Article(){}
-
- private Article(int id, String title, long timestamp, String comments, String text) {
- super();
- this.id = id;
- this.title = title;
- this.timestamp = timestamp;
- this.comments = comments;
- this.text = text;
- }
-
- public int getId() {
- return id;
- }
-
- public String getTitle() {
- return title;
- }
-
- public String getComments() {
- return comments;
- }
-
- public String getText() {
- return text;
- }
-
- public long getTimestamp() {
- return timestamp;
- }
-
- public Map<String,Object> getFieldValues() {
- Map<String,Object> fields = new HashMap<String,Object>();
- fields.put("ID", this.id);
- fields.put("TITLE", this.title);
- fields.put("TIMESTAMP", this.timestamp);
- fields.put("COMMENTS", this.comments);
- return fields;
- }
-
- public Map<String,String> getNormalizedFieldValues() {
- Map<String,String> fields = new HashMap<String,String>();
- fields.put("ID", nn.normalizeFieldValue("ID", this.id));
- fields.put("TITLE", lcdn.normalizeFieldValue("TITLE", this.title));
- fields.put("TIMESTAMP", nn.normalizeFieldValue("TIMESTAMP", this.timestamp));
- fields.put("COMMENTS", lcdn.normalizeFieldValue("COMMENTS", this.comments));
- return fields;
- }
-
- @Override
- public void readFields(DataInput in) throws IOException {
- id = in.readInt();
- Text foo = new Text();
- foo.readFields(in);
- title = foo.toString();
- timestamp = in.readLong();
- foo.readFields(in);
- comments = foo.toString();
- foo.readFields(in);
- text = foo.toString();
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeInt(id);
- (new Text(title)).write(out);
- out.writeLong(timestamp);
- (new Text(comments)).write(out);
- (new Text(text)).write(out);
- }
-
- }
-
- public ArticleExtractor() {}
-
- private static XMLInputFactory xmlif = XMLInputFactory.newInstance();
-
- static
- {
- xmlif.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
- }
-
- public Article extract(Reader reader) {
-
- XMLStreamReader xmlr = null;
-
- try {
- xmlr = xmlif.createXMLStreamReader(reader);
- } catch (XMLStreamException e1) {
- throw new RuntimeException(e1);
- }
-
- QName titleName = QName.valueOf("title");
- QName textName = QName.valueOf("text");
- QName revisionName = QName.valueOf("revision");
- QName timestampName = QName.valueOf("timestamp");
- QName commentName = QName.valueOf("comment");
- QName idName = QName.valueOf("id");
-
- Map<QName,StringBuilder> tags = new HashMap<QName,StringBuilder>();
- for (QName tag : new QName[] {titleName, textName, timestampName, commentName, idName}) {
- tags.put(tag, new StringBuilder());
- }
-
- StringBuilder articleText = tags.get(textName);
- StringBuilder titleText = tags.get(titleName);
- StringBuilder timestampText = tags.get(timestampName);
- StringBuilder commentText = tags.get(commentName);
- StringBuilder idText = tags.get(idName);
-
- StringBuilder current = null;
- boolean inRevision = false;
- while (true) {
- try {
- if (!xmlr.hasNext())
- break;
- xmlr.next();
- } catch (XMLStreamException e) {
- throw new RuntimeException(e);
- }
- QName currentName = null;
- if (xmlr.hasName()) {
- currentName = xmlr.getName();
- }
- if (xmlr.isStartElement() && tags.containsKey(currentName)) {
- if (!inRevision || (!currentName.equals(revisionName) && !currentName.equals(idName))) {
- current = tags.get(currentName);
- current.setLength(0);
- }
- } else if (xmlr.isStartElement() && currentName.equals(revisionName)) {
- inRevision = true;
- } else if (xmlr.isEndElement() && currentName.equals(revisionName)) {
- inRevision = false;
- } else if (xmlr.isEndElement() && current != null) {
- if (textName.equals(currentName)) {
-
- String title = titleText.toString();
- String text = articleText.toString();
- String comment = commentText.toString();
- int id = Integer.parseInt(idText.toString());
- long timestamp;
- try {
- timestamp = dateFormat.parse(timestampText.append("+0000").toString()).getTime();
- return new Article(id, title, timestamp, comment, text);
- } catch (ParseException e) {
- return null;
- }
- }
- current = null;
- } else if (current != null && xmlr.hasText()) {
- current.append(xmlr.getText());
- }
- }
- return null;
- }
-}
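For reference, the extractor deleted above takes a Reader over a single wikipedia <page> element and returns an Article, or null when parsing fails. A minimal usage sketch, assuming the page XML is already available in a String named pageXml (an assumption, not part of this commit):

    import java.io.StringReader;
    import java.util.Map;
    import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor;
    import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor.Article;

    public class ExtractOnePage {
      // pageXml is assumed to hold one <page>...</page> element from the dump.
      static void index(String pageXml) {
        ArticleExtractor extractor = new ArticleExtractor();
        Article article = extractor.extract(new StringReader(pageXml));
        if (article == null)
          return; // extract() returns null when the timestamp or <text> element cannot be parsed
        Map<String,String> normalized = article.getNormalizedFieldValues();
        Map<String,Object> raw = article.getFieldValues();
        System.out.println(article.getTitle() + ": " + normalized.keySet() + " (" + raw.size() + " raw fields)");
      }
    }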
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/LRUOutputCombiner.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/LRUOutputCombiner.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/LRUOutputCombiner.java
deleted file mode 100644
index 7d7b6dc..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/LRUOutputCombiner.java
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-import java.util.LinkedHashMap;
-import java.util.Map;
-
-public class LRUOutputCombiner<Key,Value> extends LinkedHashMap<Key,Value> {
-
- private static final long serialVersionUID = 1L;
-
- public static abstract class Fold<Value> {
- public abstract Value fold(Value oldValue, Value newValue);
- }
-
- public static abstract class Output<Key,Value> {
- public abstract void output(Key key, Value value);
- }
-
- private final int capacity;
- private final Fold<Value> fold;
- private final Output<Key,Value> output;
-
- private long cacheHits = 0;
- private long cacheMisses = 0;
-
- public LRUOutputCombiner(int capacity, Fold<Value> fold, Output<Key,Value> output) {
- super(capacity + 1, 1.1f, true);
- this.capacity = capacity;
- this.fold = fold;
- this.output = output;
- }
-
- protected boolean removeEldestEntry(Map.Entry<Key,Value> eldest) {
- if (size() > capacity) {
- output.output(eldest.getKey(), eldest.getValue());
- return true;
- }
- return false;
- }
-
- @Override
- public Value put(Key key, Value value) {
- Value val = get(key);
- if (val != null) {
- value = fold.fold(val, value);
- cacheHits++;
- } else {
- cacheMisses++;
- }
- super.put(key, value);
- return null;
- }
-
- public void flush() {
- for (Map.Entry<Key,Value> e : entrySet()) {
- output.output(e.getKey(), e.getValue());
- }
- clear();
- }
-}
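As a sketch of how this combiner was meant to be driven, a caller supplies a Fold to merge values that collide on the same key and an Output to receive entries evicted once the capacity is exceeded (the String/Long types and the println stand-in for a MapReduce context below are assumptions, not from this commit):

    import org.apache.accumulo.examples.wikisearch.ingest.LRUOutputCombiner;

    public class CombinerSketch {
      public static void main(String[] args) {
        LRUOutputCombiner<String,Long> counts = new LRUOutputCombiner<String,Long>(
            1000,                                            // entries kept before the eldest is evicted
            new LRUOutputCombiner.Fold<Long>() {
              public Long fold(Long oldValue, Long newValue) {
                return oldValue + newValue;                  // merge on a cache hit
              }
            },
            new LRUOutputCombiner.Output<String,Long>() {
              public void output(String key, Long value) {
                System.out.println(key + "\t" + value);      // stand-in for writing to the job context
              }
            });
        counts.put("accumulo", 1L);
        counts.put("accumulo", 1L);  // folded into the existing entry
        counts.flush();              // emit whatever is still buffered
      }
    }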
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java
deleted file mode 100644
index 27a28a1..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaConfiguration.java
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-import java.io.IOException;
-
-import org.apache.accumulo.core.client.AccumuloException;
-import org.apache.accumulo.core.client.AccumuloSecurityException;
-import org.apache.accumulo.core.client.Connector;
-import org.apache.accumulo.core.client.Instance;
-import org.apache.accumulo.core.client.ZooKeeperInstance;
-import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.util.ReflectionUtils;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.SimpleAnalyzer;
-
-public class WikipediaConfiguration {
- public final static String INSTANCE_NAME = "wikipedia.accumulo.instance_name";
- public final static String USER = "wikipedia.accumulo.user";
- public final static String PASSWORD = "wikipedia.accumulo.password";
- public final static String TABLE_NAME = "wikipedia.accumulo.table";
-
- public final static String ZOOKEEPERS = "wikipedia.accumulo.zookeepers";
-
- public final static String NAMESPACES_FILENAME = "wikipedia.namespaces.filename";
- public final static String LANGUAGES_FILENAME = "wikipedia.languages.filename";
- public final static String WORKING_DIRECTORY = "wikipedia.ingest.working";
-
- public final static String ANALYZER = "wikipedia.index.analyzer";
-
- public final static String NUM_PARTITIONS = "wikipedia.ingest.partitions";
-
- public final static String NUM_GROUPS = "wikipedia.ingest.groups";
-
- public final static String PARTITIONED_ARTICLES_DIRECTORY = "wikipedia.partitioned.directory";
-
- public final static String RUN_PARTITIONER = "wikipedia.run.partitioner";
- public final static String RUN_INGEST = "wikipedia.run.ingest";
- public final static String BULK_INGEST = "wikipedia.bulk.ingest";
- public final static String BULK_INGEST_DIR = "wikipedia.bulk.ingest.dir";
- public final static String BULK_INGEST_FAILURE_DIR = "wikipedia.bulk.ingest.failure.dir";
- public final static String BULK_INGEST_BUFFER_SIZE = "wikipedia.bulk.ingest.buffer.size";
- public final static String PARTITIONED_INPUT_MIN_SPLIT_SIZE = "wikipedia.min.input.split.size";
-
-
- public static String getUser(Configuration conf) {
- return conf.get(USER);
- };
-
- public static byte[] getPassword(Configuration conf) {
- String pass = conf.get(PASSWORD);
- if (pass == null) {
- return null;
- }
- return pass.getBytes();
- }
-
- public static String getTableName(Configuration conf) {
- String tablename = conf.get(TABLE_NAME);
- if (tablename == null) {
- throw new RuntimeException("No data table name specified in " + TABLE_NAME);
- }
- return tablename;
- }
-
- public static String getInstanceName(Configuration conf) {
- return conf.get(INSTANCE_NAME);
- }
-
- public static String getZookeepers(Configuration conf) {
- String zookeepers = conf.get(ZOOKEEPERS);
- if (zookeepers == null) {
- throw new RuntimeException("No zookeepers specified in " + ZOOKEEPERS);
- }
- return zookeepers;
- }
-
- public static Path getNamespacesFile(Configuration conf) {
- String filename = conf.get(NAMESPACES_FILENAME, new Path(getWorkingDirectory(conf), "namespaces.dat").toString());
- return new Path(filename);
- }
-
- public static Path getLanguagesFile(Configuration conf) {
- String filename = conf.get(LANGUAGES_FILENAME, new Path(getWorkingDirectory(conf), "languages.txt").toString());
- return new Path(filename);
- }
-
- public static Path getWorkingDirectory(Configuration conf) {
- String filename = conf.get(WORKING_DIRECTORY);
- return new Path(filename);
- }
-
- public static Analyzer getAnalyzer(Configuration conf) throws IOException {
- Class<? extends Analyzer> analyzerClass = conf.getClass(ANALYZER, SimpleAnalyzer.class, Analyzer.class);
- return ReflectionUtils.newInstance(analyzerClass, conf);
- }
-
- public static Connector getConnector(Configuration conf) throws AccumuloException, AccumuloSecurityException {
- return getInstance(conf).getConnector(getUser(conf), getPassword(conf));
- }
-
- public static Instance getInstance(Configuration conf) {
- return new ZooKeeperInstance(getInstanceName(conf), getZookeepers(conf));
- }
-
- public static int getNumPartitions(Configuration conf) {
- return conf.getInt(NUM_PARTITIONS, 25);
- }
-
- public static int getNumGroups(Configuration conf) {
- return conf.getInt(NUM_GROUPS, 1);
- }
-
- public static Path getPartitionedArticlesPath(Configuration conf) {
- return new Path(conf.get(PARTITIONED_ARTICLES_DIRECTORY));
- }
-
- public static long getMinInputSplitSize(Configuration conf) {
- return conf.getLong(PARTITIONED_INPUT_MIN_SPLIT_SIZE, 1l << 27);
- }
-
- public static boolean runPartitioner(Configuration conf) {
- return conf.getBoolean(RUN_PARTITIONER, false);
- }
-
- public static boolean runIngest(Configuration conf) {
- return conf.getBoolean(RUN_INGEST, true);
- }
-
- public static boolean bulkIngest(Configuration conf) {
- return conf.getBoolean(BULK_INGEST, true);
- }
-
- public static String bulkIngestDir(Configuration conf) {
- return conf.get(BULK_INGEST_DIR);
- }
-
- public static String bulkIngestFailureDir(Configuration conf) {
- return conf.get(BULK_INGEST_FAILURE_DIR);
- }
-
- public static long bulkIngestBufferSize(Configuration conf) {
- return conf.getLong(BULK_INGEST_BUFFER_SIZE,1l<<28);
- }
-
- /**
- * Helper method to get properties from Hadoop configuration
- *
- * @param <T>
- * @param conf
- * @param propertyName
- * @param resultClass
- * @throws IllegalArgumentException
- * if property is not defined, null, or empty. Or if resultClass is not handled.
- * @return value of property
- */
- @SuppressWarnings("unchecked")
- public static <T> T isNull(Configuration conf, String propertyName, Class<T> resultClass) {
- String p = conf.get(propertyName);
- if (StringUtils.isEmpty(p))
- throw new IllegalArgumentException(propertyName + " must be specified");
-
- if (resultClass.equals(String.class))
- return (T) p;
- else if (resultClass.equals(String[].class))
- return (T) conf.getStrings(propertyName);
- else if (resultClass.equals(Boolean.class))
- return (T) Boolean.valueOf(p);
- else if (resultClass.equals(Long.class))
- return (T) Long.valueOf(p);
- else if (resultClass.equals(Integer.class))
- return (T) Integer.valueOf(p);
- else if (resultClass.equals(Float.class))
- return (T) Float.valueOf(p);
- else if (resultClass.equals(Double.class))
- return (T) Double.valueOf(p);
- else
- throw new IllegalArgumentException(resultClass.getSimpleName() + " is unhandled.");
-
- }
-
-}
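A short sketch of how the ingest jobs used this class: populate (or load) the wikipedia.* keys on a Hadoop Configuration and ask it for a Connector. The instance, user, and password values below are placeholders:

    import org.apache.accumulo.core.client.Connector;
    import org.apache.accumulo.examples.wikisearch.ingest.WikipediaConfiguration;
    import org.apache.hadoop.conf.Configuration;

    public class ConnectSketch {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set(WikipediaConfiguration.ZOOKEEPERS, "zkhost:2181");   // placeholder
        conf.set(WikipediaConfiguration.INSTANCE_NAME, "accumulo");   // placeholder
        conf.set(WikipediaConfiguration.USER, "root");                // placeholder
        conf.set(WikipediaConfiguration.PASSWORD, "secret");          // placeholder
        conf.set(WikipediaConfiguration.TABLE_NAME, "wikipedia");     // placeholder
        // getConnector() builds the ZooKeeperInstance and authenticates with the values above.
        Connector connector = WikipediaConfiguration.getConnector(conf);
        System.out.println(connector.whoami());
      }
    }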
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java
deleted file mode 100644
index 50415a7..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaIngester.java
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.EnumSet;
-import java.util.List;
-import java.util.Set;
-import java.util.SortedSet;
-import java.util.TreeSet;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.accumulo.core.client.AccumuloException;
-import org.apache.accumulo.core.client.AccumuloSecurityException;
-import org.apache.accumulo.core.client.Connector;
-import org.apache.accumulo.core.client.IteratorSetting;
-import org.apache.accumulo.core.client.IteratorSetting.Column;
-import org.apache.accumulo.core.client.TableExistsException;
-import org.apache.accumulo.core.client.TableNotFoundException;
-import org.apache.accumulo.core.client.admin.TableOperations;
-import org.apache.accumulo.core.client.mapreduce.AccumuloOutputFormat;
-import org.apache.accumulo.core.data.Mutation;
-import org.apache.accumulo.core.iterators.IteratorUtil.IteratorScope;
-import org.apache.accumulo.core.iterators.user.SummingCombiner;
-import org.apache.accumulo.examples.wikisearch.iterator.GlobalIndexUidCombiner;
-import org.apache.accumulo.examples.wikisearch.iterator.TextIndexCombiner;
-import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
-import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.PathFilter;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.util.Tool;
-import org.apache.hadoop.util.ToolRunner;
-
-public class WikipediaIngester extends Configured implements Tool {
-
- public final static String INGEST_LANGUAGE = "wikipedia.ingest_language";
- public final static String SPLIT_FILE = "wikipedia.split_file";
- public final static String TABLE_NAME = "wikipedia.table";
-
- public static void main(String[] args) throws Exception {
- int res = ToolRunner.run(new Configuration(), new WikipediaIngester(), args);
- System.exit(res);
- }
-
- private void createTables(TableOperations tops, String tableName) throws AccumuloException, AccumuloSecurityException, TableNotFoundException,
- TableExistsException {
- // Create the shard table
- String indexTableName = tableName + "Index";
- String reverseIndexTableName = tableName + "ReverseIndex";
- String metadataTableName = tableName + "Metadata";
-
- // create the shard table
- if (!tops.exists(tableName)) {
- // Set a text index combiner on the given field names. No combiner is set if the option is not supplied
- String textIndexFamilies = WikipediaMapper.TOKENS_FIELD_NAME;
-
- tops.create(tableName);
- if (textIndexFamilies.length() > 0) {
- System.out.println("Adding content combiner on the fields: " + textIndexFamilies);
-
- IteratorSetting setting = new IteratorSetting(10, TextIndexCombiner.class);
- List<Column> columns = new ArrayList<Column>();
- for (String family : StringUtils.split(textIndexFamilies, ',')) {
- columns.add(new Column("fi\0" + family));
- }
- TextIndexCombiner.setColumns(setting, columns);
- TextIndexCombiner.setLossyness(setting, true);
-
- tops.attachIterator(tableName, setting, EnumSet.allOf(IteratorScope.class));
- }
-
- // Set the locality group for the full content column family
- tops.setLocalityGroups(tableName, Collections.singletonMap("WikipediaDocuments", Collections.singleton(new Text(WikipediaMapper.DOCUMENT_COLUMN_FAMILY))));
-
- }
-
- if (!tops.exists(indexTableName)) {
- tops.create(indexTableName);
- // Add the UID combiner
- IteratorSetting setting = new IteratorSetting(19, "UIDAggregator", GlobalIndexUidCombiner.class);
- GlobalIndexUidCombiner.setCombineAllColumns(setting, true);
- GlobalIndexUidCombiner.setLossyness(setting, true);
- tops.attachIterator(indexTableName, setting, EnumSet.allOf(IteratorScope.class));
- }
-
- if (!tops.exists(reverseIndexTableName)) {
- tops.create(reverseIndexTableName);
- // Add the UID combiner
- IteratorSetting setting = new IteratorSetting(19, "UIDAggregator", GlobalIndexUidCombiner.class);
- GlobalIndexUidCombiner.setCombineAllColumns(setting, true);
- GlobalIndexUidCombiner.setLossyness(setting, true);
- tops.attachIterator(reverseIndexTableName, setting, EnumSet.allOf(IteratorScope.class));
- }
-
- if (!tops.exists(metadataTableName)) {
- // Add the SummingCombiner with VARLEN encoding for the frequency column
- tops.create(metadataTableName);
- IteratorSetting setting = new IteratorSetting(10, SummingCombiner.class);
- SummingCombiner.setColumns(setting, Collections.singletonList(new Column("f")));
- SummingCombiner.setEncodingType(setting, SummingCombiner.Type.VARLEN);
- tops.attachIterator(metadataTableName, setting, EnumSet.allOf(IteratorScope.class));
- }
- }
-
- @Override
- public int run(String[] args) throws Exception {
- Job job = new Job(getConf(), "Ingest Wikipedia");
- Configuration conf = job.getConfiguration();
- conf.set("mapred.map.tasks.speculative.execution", "false");
-
- String tablename = WikipediaConfiguration.getTableName(conf);
-
- String zookeepers = WikipediaConfiguration.getZookeepers(conf);
- String instanceName = WikipediaConfiguration.getInstanceName(conf);
-
- String user = WikipediaConfiguration.getUser(conf);
- byte[] password = WikipediaConfiguration.getPassword(conf);
- Connector connector = WikipediaConfiguration.getConnector(conf);
-
- TableOperations tops = connector.tableOperations();
-
- createTables(tops, tablename);
-
- configureJob(job);
-
- List<Path> inputPaths = new ArrayList<Path>();
- SortedSet<String> languages = new TreeSet<String>();
- FileSystem fs = FileSystem.get(conf);
- Path parent = new Path(conf.get("wikipedia.input"));
- listFiles(parent, fs, inputPaths, languages);
-
- System.out.println("Input files in " + parent + ":" + inputPaths.size());
- Path[] inputPathsArray = new Path[inputPaths.size()];
- inputPaths.toArray(inputPathsArray);
-
- System.out.println("Languages:" + languages.size());
-
- FileInputFormat.setInputPaths(job, inputPathsArray);
-
- job.setMapperClass(WikipediaMapper.class);
- job.setNumReduceTasks(0);
- job.setMapOutputKeyClass(Text.class);
- job.setMapOutputValueClass(Mutation.class);
- job.setOutputFormatClass(AccumuloOutputFormat.class);
- AccumuloOutputFormat.setOutputInfo(job.getConfiguration(), user, password, true, tablename);
- AccumuloOutputFormat.setZooKeeperInstance(job.getConfiguration(), instanceName, zookeepers);
-
- return job.waitForCompletion(true) ? 0 : 1;
- }
-
- public final static PathFilter partFilter = new PathFilter() {
- @Override
- public boolean accept(Path path) {
- return path.getName().startsWith("part");
- };
- };
-
- protected void configureJob(Job job) {
- Configuration conf = job.getConfiguration();
- job.setJarByClass(WikipediaIngester.class);
- job.setInputFormatClass(WikipediaInputFormat.class);
- conf.set(AggregatingRecordReader.START_TOKEN, "<page>");
- conf.set(AggregatingRecordReader.END_TOKEN, "</page>");
- }
-
- protected static final Pattern filePattern = Pattern.compile("([a-z_]+).*.xml(.bz2)?");
-
- protected void listFiles(Path path, FileSystem fs, List<Path> files, Set<String> languages) throws IOException {
- for (FileStatus status : fs.listStatus(path)) {
- if (status.isDir()) {
- listFiles(status.getPath(), fs, files, languages);
- } else {
- Path p = status.getPath();
- Matcher matcher = filePattern.matcher(p.getName());
- if (matcher.matches()) {
- languages.add(matcher.group(1));
- files.add(p);
- }
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java
deleted file mode 100644
index c582cbf..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaInputFormat.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.accumulo.examples.wikisearch.reader.AggregatingRecordReader;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.Writable;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.input.FileSplit;
-import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
-
-
-public class WikipediaInputFormat extends TextInputFormat {
-
- public static class WikipediaInputSplit extends InputSplit implements Writable {
-
- public WikipediaInputSplit(){}
-
- public WikipediaInputSplit(FileSplit fileSplit, int partition)
- {
- this.fileSplit = fileSplit;
- this.partition = partition;
- }
-
- private FileSplit fileSplit = null;
- private int partition = -1;
-
- public int getPartition()
- {
- return partition;
- }
-
- public FileSplit getFileSplit()
- {
- return fileSplit;
- }
-
- @Override
- public long getLength() throws IOException, InterruptedException {
- return fileSplit.getLength();
- }
-
- @Override
- public String[] getLocations() throws IOException, InterruptedException {
- // for highly replicated files, returning all of the locations can lead to bunching
- // TODO replace this with a subset of the locations
- return fileSplit.getLocations();
- }
-
- @Override
- public void readFields(DataInput in) throws IOException {
- Path file = new Path(in.readUTF());
- long start = in.readLong();
- long length = in.readLong();
- String [] hosts = null;
- if(in.readBoolean())
- {
- int numHosts = in.readInt();
- hosts = new String[numHosts];
- for(int i = 0; i < numHosts; i++)
- hosts[i] = in.readUTF();
- }
- fileSplit = new FileSplit(file, start, length, hosts);
- partition = in.readInt();
- }
-
- @Override
- public void write(DataOutput out) throws IOException {
- out.writeUTF(fileSplit.getPath().toString());
- out.writeLong(fileSplit.getStart());
- out.writeLong(fileSplit.getLength());
- String [] hosts = fileSplit.getLocations();
- if(hosts == null)
- {
- out.writeBoolean(false);
- }
- else
- {
- out.writeBoolean(true);
- out.writeInt(hosts.length);
- for(String host:hosts)
- out.writeUTF(host);
- }
- out.writeInt(partition);
- }
-
- }
-
- @Override
- public List<InputSplit> getSplits(JobContext job) throws IOException {
- List<InputSplit> superSplits = super.getSplits(job);
- List<InputSplit> splits = new ArrayList<InputSplit>();
-
- int numGroups = WikipediaConfiguration.getNumGroups(job.getConfiguration());
-
- for(int group = 0; group < numGroups; group++)
- {
- for(InputSplit split:superSplits)
- {
- FileSplit fileSplit = (FileSplit)split;
- splits.add(new WikipediaInputSplit(fileSplit,group));
- }
- }
- return splits;
- }
-
- @Override
- public RecordReader<LongWritable,Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
- return new AggregatingRecordReader();
- }
-}
http://git-wip-us.apache.org/repos/asf/accumulo/blob/8db62992/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
----------------------------------------------------------------------
diff --git a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java b/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
deleted file mode 100644
index 8565b09..0000000
--- a/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
+++ /dev/null
@@ -1,245 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- *
- */
-package org.apache.accumulo.examples.wikisearch.ingest;
-
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.StringReader;
-import java.nio.charset.Charset;
-import java.util.HashSet;
-import java.util.IllegalFormatException;
-import java.util.Map.Entry;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.accumulo.core.data.Mutation;
-import org.apache.accumulo.core.data.Value;
-import org.apache.accumulo.core.security.ColumnVisibility;
-import org.apache.accumulo.examples.wikisearch.ingest.ArticleExtractor.Article;
-import org.apache.accumulo.examples.wikisearch.ingest.WikipediaInputFormat.WikipediaInputSplit;
-import org.apache.accumulo.examples.wikisearch.normalizer.LcNoDiacriticsNormalizer;
-import org.apache.accumulo.examples.wikisearch.protobuf.Uid;
-import org.apache.accumulo.examples.wikisearch.protobuf.Uid.List.Builder;
-import org.apache.commons.codec.binary.Base64;
-import org.apache.commons.lang.StringUtils;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.lib.input.FileSplit;
-import org.apache.log4j.Logger;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;
-
-import com.google.common.collect.HashMultimap;
-import com.google.common.collect.Multimap;
-
-public class WikipediaMapper extends Mapper<LongWritable,Text,Text,Mutation> {
-
- private static final Logger log = Logger.getLogger(WikipediaMapper.class);
-
- public final static Charset UTF8 = Charset.forName("UTF-8");
- public static final String DOCUMENT_COLUMN_FAMILY = "d";
- public static final String METADATA_EVENT_COLUMN_FAMILY = "e";
- public static final String METADATA_INDEX_COLUMN_FAMILY = "i";
- public static final String TOKENS_FIELD_NAME = "TEXT";
-
- private final static Pattern languagePattern = Pattern.compile("([a-z_]+).*.xml(.bz2)?");
- private static final Value NULL_VALUE = new Value(new byte[0]);
- private static final String cvPrefix = "all|";
-
- private ArticleExtractor extractor;
- private String language;
- private int numPartitions = 0;
- private ColumnVisibility cv = null;
-
- private int myGroup = -1;
- private int numGroups = -1;
-
- private Text tablename = null;
- private Text indexTableName = null;
- private Text reverseIndexTableName = null;
- private Text metadataTableName = null;
-
- @Override
- public void setup(Context context) {
- Configuration conf = context.getConfiguration();
- tablename = new Text(WikipediaConfiguration.getTableName(conf));
- indexTableName = new Text(tablename + "Index");
- reverseIndexTableName = new Text(tablename + "ReverseIndex");
- metadataTableName = new Text(tablename + "Metadata");
-
- WikipediaInputSplit wiSplit = (WikipediaInputSplit)context.getInputSplit();
- myGroup = wiSplit.getPartition();
- numGroups = WikipediaConfiguration.getNumGroups(conf);
-
- FileSplit split = wiSplit.getFileSplit();
- String fileName = split.getPath().getName();
- Matcher matcher = languagePattern.matcher(fileName);
- if (matcher.matches()) {
- language = matcher.group(1).replace('_', '-').toLowerCase();
- } else {
- throw new RuntimeException("Unknown ingest language! " + fileName);
- }
- extractor = new ArticleExtractor();
- numPartitions = WikipediaConfiguration.getNumPartitions(conf);
- cv = new ColumnVisibility(cvPrefix + language);
-
- }
-
- /**
- * We will partition the documents based on the document id
- *
- * @param article the article whose id determines its partition
- * @param numPartitions the total number of partitions
- * @return The number of the partition for a given article.
- * @throws IllegalFormatException
- */
- public static int getPartitionId(Article article, int numPartitions) throws IllegalFormatException {
- return article.getId() % numPartitions;
- }
-
- static HashSet<String> metadataSent = new HashSet<String>();
-
- @Override
- protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
- Article article = extractor.extract(new InputStreamReader(new ByteArrayInputStream(value.getBytes()), UTF8));
- String NULL_BYTE = "\u0000";
- String colfPrefix = language + NULL_BYTE;
- String indexPrefix = "fi" + NULL_BYTE;
- if (article != null) {
- int groupId = WikipediaMapper.getPartitionId(article, numGroups);
- if(groupId != myGroup)
- return;
- Text partitionId = new Text(Integer.toString(WikipediaMapper.getPartitionId(article, numPartitions)));
-
- // Create the mutations for the document.
- // Row is partition id, colf is language\0articleid, colq is fieldName\0fieldValue
- Mutation m = new Mutation(partitionId);
- for (Entry<String,Object> entry : article.getFieldValues().entrySet()) {
- m.put(colfPrefix + article.getId(), entry.getKey() + NULL_BYTE + entry.getValue().toString(), cv, article.getTimestamp(), NULL_VALUE);
- // Create mutations for the metadata table.
- String metadataKey = entry.getKey() + METADATA_EVENT_COLUMN_FAMILY + language;
- if (!metadataSent.contains(metadataKey)) {
- Mutation mm = new Mutation(entry.getKey());
- mm.put(METADATA_EVENT_COLUMN_FAMILY, language, cv, article.getTimestamp(), NULL_VALUE);
- context.write(metadataTableName, mm);
- metadataSent.add(metadataKey);
- }
- }
-
- // Tokenize the content
- Set<String> tokens = getTokens(article);
-
- // We are going to put the fields to be indexed into a multimap. This allows us to iterate
- // over the entire set once.
- Multimap<String,String> indexFields = HashMultimap.create();
- // Add the normalized field values
- LcNoDiacriticsNormalizer normalizer = new LcNoDiacriticsNormalizer();
- for (Entry<String,String> index : article.getNormalizedFieldValues().entrySet())
- indexFields.put(index.getKey(), index.getValue());
- // Add the tokens
- for (String token : tokens)
- indexFields.put(TOKENS_FIELD_NAME, normalizer.normalizeFieldValue("", token));
-
- for (Entry<String,String> index : indexFields.entries()) {
- // Create mutations for the in partition index
- // Row is partition id, colf is 'fi'\0fieldName, colq is fieldValue\0language\0article id
- m.put(indexPrefix + index.getKey(), index.getValue() + NULL_BYTE + colfPrefix + article.getId(), cv, article.getTimestamp(), NULL_VALUE);
-
- // Create mutations for the global index
- // Create a UID object for the Value
- Builder uidBuilder = Uid.List.newBuilder();
- uidBuilder.setIGNORE(false);
- uidBuilder.setCOUNT(1);
- uidBuilder.addUID(Integer.toString(article.getId()));
- Uid.List uidList = uidBuilder.build();
- Value val = new Value(uidList.toByteArray());
-
- // Create mutations for the global index
- // Row is field value, colf is field name, colq is partitionid\0language, value is Uid.List object
- Mutation gm = new Mutation(index.getValue());
- gm.put(index.getKey(), partitionId + NULL_BYTE + language, cv, article.getTimestamp(), val);
- context.write(indexTableName, gm);
-
- // Create mutations for the global reverse index
- Mutation grm = new Mutation(StringUtils.reverse(index.getValue()));
- grm.put(index.getKey(), partitionId + NULL_BYTE + language, cv, article.getTimestamp(), val);
- context.write(reverseIndexTableName, grm);
-
- // Create mutations for the metadata table.
- String metadataKey = index.getKey() + METADATA_INDEX_COLUMN_FAMILY + language;
- if (!metadataSent.contains(metadataKey)) {
- Mutation mm = new Mutation(index.getKey());
- mm.put(METADATA_INDEX_COLUMN_FAMILY, language + NULL_BYTE + LcNoDiacriticsNormalizer.class.getName(), cv, article.getTimestamp(), NULL_VALUE);
- context.write(metadataTableName, mm);
- metadataSent.add(metadataKey);
- }
- }
- // Add the entire text to the document section of the table.
- // row is the partition, colf is 'd', colq is language\0articleid, value is Base64 encoded GZIP'd document
- m.put(DOCUMENT_COLUMN_FAMILY, colfPrefix + article.getId(), cv, article.getTimestamp(), new Value(Base64.encodeBase64(article.getText().getBytes())));
- context.write(tablename, m);
-
- } else {
- context.getCounter("wikipedia", "invalid articles").increment(1);
- }
- context.progress();
- }
-
- /**
- * Tokenize the wikipedia article content
- *
- * @param article the article whose text is tokenized
- * @return the set of unique tokens found in the article text
- * @throws IOException
- */
- static Set<String> getTokens(Article article) throws IOException {
- Set<String> tokenList = new HashSet<String>();
- WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()));
- TermAttribute term = tok.addAttribute(TermAttribute.class);
- try {
- while (tok.incrementToken()) {
- String token = term.term();
- if (!StringUtils.isEmpty(token))
- tokenList.add(token);
- }
- } catch (IOException e) {
- log.error("Error tokenizing text", e);
- } finally {
- try {
- tok.end();
- } catch (IOException e) {
- log.error("Error calling end()", e);
- } finally {
- try {
- tok.close();
- } catch (IOException e) {
- log.error("Error closing tokenizer", e);
- }
- }
- }
- return tokenList;
- }
-
-}