You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/12/14 13:38:59 UTC

svn commit: r1421823 - in /stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8: ./ README.md copy_en_values.ldpath entityrankings.sh fetch_data_de.sh fetch_data_en_int.sh

Author: rwesten
Date: Fri Dec 14 12:38:57 2012
New Revision: 1421823

URL: http://svn.apache.org/viewvc?rev=1421823&view=rev
Log:
added some work-in-progress scripts for indexing dbpedia 3.8

Added:
    stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/
    stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/README.md
    stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/copy_en_values.ldpath
    stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/entityrankings.sh   (with props)
    stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/fetch_data_de.sh   (with props)
    stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/fetch_data_en_int.sh   (with props)

Added: stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/README.md
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/README.md?rev=1421823&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/README.md (added)
+++ stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/README.md Fri Dec 14 12:38:57 2012
@@ -0,0 +1,48 @@
+DBpedia 3.8
+===========
+
+This folder contains work-in-progress scripts for
+
+* creating incoming links file for different language versions of dbpedia
+* downloading and pre-processing of dbpedia dump files
+
+entityrankings.sh
+-----------------
+
+This will compute the incoming links for the language passed as parameter. In addition, it will copy over the number of incoming links of the main DBpedia page to all pages that redirect to this page.
+
+### usage
+
+    ./entityrankings.sh {lang}
+
+{lang} ... the language of the DBpedia dump to use (e.g. "en", "de" ...) 
+
+
+fetch_data_**.sh
+----------------
+
+In DBpedia 3.8 most files are affected by UTF-8 encoding issues that cause errors during the import of the data into Jena TDB. Because of that, these scripts correct all downloaded files.
+
+The list of files to download is specified in an array at the beginning of the script. Users will need to edit this list based on their needs.
+
+Two examples are given: (1) a DBpedia index based on the English DBpedia version with international labels, and (2) a DBpedia index for German with English labels.
+
+Note that for non-English DBpedia versions one also needs to use the "copy_en_values.ldpath" LDPath program during indexing.
+
+copy_en_values.ldpath
+---------------------
+
+This is needed when indexing non-English DBpedia dumps.
+
+Make sure to copy this file into "indexing/config" and to configure 
+
+    entityProcessor=org.apache.stanbol.entityhub.indexing.core.processor.FieldValueFilter,config:entityTypes;org.apache.stanbol.entityhub.indexing.core.processor.LdpathSourceProcessor,ldpath:copy_en_values.ldpath;org.apache.stanbol.entityhub.indexing.core.processor.FiledMapperProcessor
+
+in your indexing.properties file.
+
+Depending on their use case, users might also want to add additional properties to the "copy_en_values.ldpath" program.
+
+The pattern is
+
+    {property} = dbp-ont:wikiPageInterLanguageLink/{property};
+

Added: stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/copy_en_values.ldpath
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/copy_en_values.ldpath?rev=1421823&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/copy_en_values.ldpath (added)
+++ stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/copy_en_values.ldpath Fri Dec 14 12:38:57 2012
@@ -0,0 +1,7 @@
+rdfs:label = dbp-ont:wikiPageInterLanguageLink/rdfs:label;
+skos:altLabel = ^dbp-ont:wikiPageRedirects/rdfs:label | dbp-ont:wikiPageInterLanguageLink/^dbp-ont:wikiPageRedirects/rdfs:label;
+rdf:type = dbp-ont:wikiPageInterLanguageLink/rdf:type;
+rdfs:comment = dbp-ont:wikiPageInterLanguageLink/rdfs:comment;
+geo:lat = dbp-ont:wikiPageInterLanguageLink/geo:lat;
+geo:long = dbp-ont:wikiPageInterLanguageLink/geo:long;
+geo:alt = dbp-ont:wikiPageInterLanguageLink/geo:alt;
\ No newline at end of file

Added: stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/entityrankings.sh
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/entityrankings.sh?rev=1421823&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/entityrankings.sh (added)
+++ stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/entityrankings.sh Fri Dec 14 12:38:57 2012
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+echo ">> Building incoming links File <<"
+
+WORKSPACE=.
+DBPEDIA=http://downloads.dbpedia.org/3.8
+
+MAX_SORT_MEM=4G
+
+# Turn on echoing and exit on error
+set -x -e -o pipefail
+
+# The language to build the index
+LANGUAGE=$1
+INCOMING_FILE=${WORKSPACE}/incoming_links_${LANGUAGE}.txt
+
+#prepare Page_Links
+PAGE_LINKS=page_links_${LANGUAGE}.nt
+PAGE_LINKS_FILE=${PAGE_LINKS}.gz
+if [ ! -f ${PAGE_LINKS_FILE} ]
+then
+    url=${DBPEDIA}/${LANGUAGE}/${PAGE_LINKS}.bz2
+    wget -c ${url}
+    echo "cleaning $PAGE_LINKS ..."
+    #corrects encoding and recompress using gz
+    bzcat ${PAGE_LINKS}.bz2 \
+        | sed 's/\\\\/\\u005c\\u005c/g;s/\\\([^u"]\)/\\u005c\1/g' \
+        | gzip -c > ${PAGE_LINKS_FILE}
+    rm -f ${PAGE_LINKS}.bz2
+fi
+
+#prepare Redirects
+REDIRECTS=redirects_${LANGUAGE}.nt
+REDIRECTS_FILE=${REDIRECTS}.gz
+
+if [ ! -f ${REDIRECTS_FILE} ]
+then
+    url=${DBPEDIA}/${LANGUAGE}/${REDIRECTS}.bz2
+    wget -c ${url}
+    echo "cleaning $REDIRECTS ..."
+    #corrects encoding and recompress using gz
+    bzcat ${REDIRECTS}.bz2 \
+        | sed 's/\\\\/\\u005c\\u005c/g;s/\\\([^u"]\)/\\u005c\1/g' \
+        | gzip -c > ${REDIRECTS_FILE}
+    rm -f ${REDIRECTS}.bz2
+fi
+
+zcat ${PAGE_LINKS_FILE} \
+| sed -e 's/.*dbpedia\.org\/resource\/\([^>]*\)> ./\1/' \
+| sort -S $MAX_SORT_MEM \
+| uniq -c  \
+| sort -nr -S $MAX_SORT_MEM > $INCOMING_FILE
+
+# Sort the incoming links on the entities, removing initial spaces added by uniq
+cat $INCOMING_FILE \
+    | sed 's/^\s*//' \
+    | sort -k 2b,2 > $WORKSPACE/incoming_links_sorted_k2.txt
+
+mv $INCOMING_FILE $WORKSPACE/original_incoming_links_${LANGUAGE}.txt
+
+# Sort the redirects of the selected language ("source target" pairs) by target (field 2) for the join below
+zcat "${REDIRECTS_FILE}" | grep -v "^#" \
+    | sed 's/.*dbpedia\.org\/resource\/\([^>]*\)>.*dbpedia\.org\/resource\/\([^>]*\)> ./\1 \2/' \
+    | sort -k 2b,2 > $WORKSPACE/redirects_sorted_k2.txt
+
+# Join redirects with the original incoming links to assign the
+# same ranking to redirects
+join -j 2 -o 2.1 1.1 $WORKSPACE/redirects_sorted_k2.txt $WORKSPACE/incoming_links_sorted_k2.txt \
+    > $WORKSPACE/incoming_links_redirects.txt
+
+# Merge the two files - maybe use sort merge?!
+cat $WORKSPACE/incoming_links_redirects.txt $WORKSPACE/incoming_links_sorted_k2.txt \
+    | sort -nr -S $MAX_SORT_MEM > $INCOMING_FILE
+
+# WE ARE NOT REMOVING INTERMEDIATE FILES
+# rm -f $WORKSPACE/incoming_links_sorted_k2.txt
+# rm -f $WORKSPACE/redirects_sorted_k2.txt
+# rm -f $WORKSPACE/incoming_links_redirects.txt

Propchange: stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/entityrankings.sh
------------------------------------------------------------------------------
    svn:executable = *

Added: stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/fetch_data_de.sh
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/fetch_data_de.sh?rev=1421823&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/fetch_data_de.sh (added)
+++ stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/fetch_data_de.sh Fri Dec 14 12:38:57 2012
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+INDEXING_JAR=./org.apache.stanbol.entityhub.indexing.dbpedia-*-jar-with-dependencies.jar
+WORKSPACE=.
+DBPEDIA=http://downloads.dbpedia.org/3.8
+
+# Turn on echoing and exit on error
+set -x -e -o pipefail
+
+java -jar $INDEXING_JAR init
+
+# Download the RDF dumps:
+cd $WORKSPACE/indexing/resources/rdfdata
+
+# General attributes for all entities
+
+files=(dbpedia_3.8.owl \
+    en/labels_en.nt \
+    de/labels_de.nt \
+    en/short_abstracts_en.nt \
+    de/short_abstracts_de.nt \
+    en/instance_types_en.nt \
+    de/instance_types_de.nt \
+    en/images_en.nt \
+    de/images_de.nt \
+    en/geo_coordinates_en.nt \
+    de/geo_coordinates_de.nt \
+    en/redirects_en.nt \
+    de/redirects_de.nt \
+    de/mappingbased_properties_de.nt \
+    de/article_categories_de.nt \
+    )
+
+for i in "${files[@]}"
+do
+    :
+    # clean possible encoding errors
+    filename=$(basename $i)
+    if [ ! -f ${filename}.gz ]
+    then
+        url=${DBPEDIA}/${i}.bz2
+        wget -c ${url}
+        echo "cleaning $filename ..."
+        #corrects encoding and recompress using gz
+        bzcat ${filename}.bz2 \
+            | sed 's/\\\\/\\u005c\\u005c/g;s/\\\([^u"]\)/\\u005c\1/g' \
+            | gzip -c > ${filename}.gz
+        rm -f ${filename}.bz2
+    fi
+done
+
+cd ../../..
+
+set +xe
+
+# Instruction to launch the indexing
+echo "Preparation & data fetch done: edit config in $WORKSPACE/indexing/config/"
+echo "Then launch indexing command:"
+echo "(cd $WORKSPACE && java -jar $INDEXING_JAR index)"
+

Propchange: stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/fetch_data_de.sh
------------------------------------------------------------------------------
    svn:executable = *

Added: stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/fetch_data_en_int.sh
URL: http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/fetch_data_en_int.sh?rev=1421823&view=auto
==============================================================================
--- stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/fetch_data_en_int.sh (added)
+++ stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/fetch_data_en_int.sh Fri Dec 14 12:38:57 2012
@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+INDEXING_JAR=./org.apache.stanbol.entityhub.indexing.dbpedia-*-jar-with-dependencies.jar
+WORKSPACE=.
+DBPEDIA=http://downloads.dbpedia.org/3.8
+
+# Turn on echoing and exit on error
+set -x -e -o pipefail
+
+java -jar $INDEXING_JAR init
+
+# Download the RDF dumps:
+cd $WORKSPACE/indexing/resources/rdfdata
+
+# RDF dump files to download: paths relative to ${DBPEDIA}, without the ".bz2" suffix.
+# The loop below stores each file under its basename, so every entry needs a unique file name.
+files=(dbpedia_3.8.owl \
+    en/labels_en.nt \
+    en/short_abstracts_en.nt \
+    en/long_abstracts_en.nt \
+    en/instance_types_en.nt \
+    en/images_en.nt \
+    en/geo_coordinates_en.nt \
+    en/redirects_en.nt \
+    en/page_links_en.nt \
+    en/mappingbased_properties_en.nt \
+    de/labels_en_uris_de.nt \
+    ar/labels_en_uris_ar.nt es/labels_en_uris_es.nt fr/labels_en_uris_fr.nt \
+    he/labels_en_uris_he.nt it/labels_en_uris_it.nt ja/labels_en_uris_ja.nt \
+    ru/labels_en_uris_ru.nt tr/labels_en_uris_tr.nt nl/labels_en_uris_nl.nt \
+    zh/labels_en_uris_zh.nt pt/labels_en_uris_pt.nt sv/labels_en_uris_sv.nt \
+    da/labels_en_uris_da.nt \
+    de/short_abstracts_en_uris_de.nt es/short_abstracts_en_uris_es.nt \
+    fr/short_abstracts_en_uris_fr.nt ar/short_abstracts_en_uris_ar.nt \
+    zh/short_abstracts_en_uris_zh.nt it/short_abstracts_en_uris_it.nt \
+    de/long_abstracts_en_uris_de.nt it/long_abstracts_en_uris_it.nt \
+    es/long_abstracts_en_uris_es.nt fr/long_abstracts_en_uris_fr.nt \
+    en/skos_categories_en.nt \
+    en/article_categories_en.nt \
+    )
+
+for i in "${files[@]}"
+do
+    :
+    # clean possible encoding errors
+    filename=$(basename $i)
+    if [ ! -f ${filename}.gz ]
+    then
+        url=${DBPEDIA}/${i}.bz2
+        wget -c ${url}
+        echo "cleaning $filename ..."
+        #corrects encoding and recompress using gz
+        bzcat ${filename}.bz2 \
+            | sed 's/\\\\/\\u005c\\u005c/g;s/\\\([^u"]\)/\\u005c\1/g' \
+            | gzip -c > ${filename}.gz
+        rm -f ${filename}.bz2
+    fi
+done
+
+cd ../../..
+
+set +xe
+
+# Instruction to launch the indexing
+echo "Preparation & data fetch done: edit config in $WORKSPACE/indexing/config/"
+echo "Then launch indexing command:"
+echo "(cd $WORKSPACE && java -jar $INDEXING_JAR index)"
+

Propchange: stanbol/trunk/entityhub/indexing/dbpedia/dbpedia-3.8/fetch_data_en_int.sh
------------------------------------------------------------------------------
    svn:executable = *