You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by wi...@apache.org on 2014/12/17 14:44:28 UTC

svn commit: r1646234 - in /stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-2014: ./ README.md fetch_data.sh

Author: wikier
Date: Wed Dec 17 13:44:28 2014
New Revision: 1646234

URL: http://svn.apache.org/r1646234
Log:
added basic bash infrastructure to download dbpedia 2014 dumps

Added:
    stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-2014/
    stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-2014/README.md
    stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-2014/fetch_data.sh   (with props)

Added: stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-2014/README.md
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-2014/README.md?rev=1646234&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-2014/README.md (added)
+++ stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-2014/README.md Wed Dec 17 13:44:28 2014
@@ -0,0 +1,30 @@
+<!-- 
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+
+DBpedia 2014
+============
+
+This folder contains work-in-progress scripts for downloading DBpedia dump files.
+
+fetch_data.sh
+-------------
+
+This script fetches all data from the [DBpedia download server](http://data.dws.informatik.uni-mannheim.de/dbpedia/2014/).
+The list of files to download is specified in arrays at the beginning of the
+script, templatized by language. Users may need to edit this list to suit
+their needs.
+

Added: stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-2014/fetch_data.sh
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-2014/fetch_data.sh?rev=1646234&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-2014/fetch_data.sh (added)
+++ stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-2014/fetch_data.sh Wed Dec 17 13:44:28 2014
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+DBPEDIA=http://data.dws.informatik.uni-mannheim.de/dbpedia/2014
+
+# Turn on echoing and exit on error
+set -x -e -o pipefail
+
+# Language codes for which the per-language dump files (files_lang) are fetched
+langs=( \
+    ar \
+    da \
+    de \
+    es \
+    fi \
+    fr \
+    he \
+    hr \
+    hi \
+    hu \
+    it \
+    ja \
+    nl \
+    no \
+    pl \
+    pt \
+    ru \
+    sv \
+    tr \
+    zh \
+    )
+
+files_lang=( \
+    %LANG%/labels_%LANG%.ttl \
+    %LANG%/labels_en_uris_%LANG%.ttl \
+    %LANG%/long_abstracts_%LANG%.ttl \
+    %LANG%/long_abstracts_en_uris_%LANG%.ttl \
+    %LANG%/short_abstracts_%LANG%.ttl \
+    %LANG%/short_abstracts_en_uris_%LANG%.ttl \
+    %LANG%/instance_types_%LANG%.ttl \
+    %LANG%/images_%LANG%.ttl \
+    %LANG%/geo_coordinates_%LANG%.ttl \
+    %LANG%/mappingbased_properties_%LANG%.ttl \
+    %LANG%/homepages_en_uris_%LANG%.ttl \
+    %LANG%/homepages_%LANG%.ttl \
+    %LANG%/raw_infobox_properties_%LANG%.ttl \
+    %LANG%/article_categories_%LANG%.ttl \
+    %LANG%/article_categories_en_uris_%LANG%.ttl \
+    %LANG%/skos_categories_%LANG%.ttl \
+    %LANG%/skos_categories_en_uris_%LANG%.ttl \
+    )
+
+files=( \
+    en/labels_en.ttl \
+    en/instance_types_en.ttl \
+    en/images_en.ttl \
+    en/geo_coordinates_en.ttl \
+    en/mappingbased_properties_en.ttl \
+    en/long_abstracts_en.ttl \
+    en/short_abstracts_en.ttl \
+    en/homepages_en.ttl \
+    en/raw_infobox_properties_en.ttl \
+    en/article_categories_en.ttl \
+    en/skos_categories_en.ttl \
+
+    links/yago_types.ttl \
+    links/yago_type_links.ttl \
+    links/yago_taxonomy.ttl \
+    links/yago_links.ttl \
+    links/freebase_links.nt \
+    links/geonames_links.ttl \
+    links/musicbrainz_links.nt \
+    links/openei_links.nt \
+    links/nytimes_links.nt \
+    links/factbook_links.nt \
+    links/eurostat_wbsg_links.nt \
+    links/eurostat_linkedstatistics_links.nt \
+    links/gutenberg_links.nt 
+    
+    )
+
+# Regenerate urls.txt: per-language files for every language, then the fixed files.
+{
+    for lang in "${langs[@]}"; do
+        for i in "${files_lang[@]}"; do
+            f="${i//'%LANG%'/$lang}"
+            printf '%s\n' "${DBPEDIA}/${f}.bz2"
+        done
+    done
+    for f in "${files[@]}"; do
+        printf '%s\n' "${DBPEDIA}/${f}.bz2"
+    done
+} > urls.txt
+
+wget -c -i urls.txt  # -c: resume partial downloads, -i: read URLs from the file
+
+set +xe
+

Propchange: stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-2014/fetch_data.sh
------------------------------------------------------------------------------
    svn:executable = *