Posted to commits@stanbol.apache.org by rw...@apache.org on 2011/02/17 13:00:20 UTC

svn commit: r1071590 [3/3] - in /incubator/stanbol/trunk/entityhub: ./ generic/servicesapi/src/main/java/org/apache/stanbol/entityhub/servicesapi/defaults/ indexing/dblp/ indexing/dblp/solrConf/ indexing/dblp/solrConf/dblp/ indexing/dblp/solrConf/dblp/...

Added: incubator/stanbol/trunk/entityhub/indexing/dblp/src/main/java/org/apache/stanbol/entityhub/indexing/dblp/cli/CommandLineRunner.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/dblp/src/main/java/org/apache/stanbol/entityhub/indexing/dblp/cli/CommandLineRunner.java?rev=1071590&view=auto
==============================================================================
--- incubator/stanbol/trunk/entityhub/indexing/dblp/src/main/java/org/apache/stanbol/entityhub/indexing/dblp/cli/CommandLineRunner.java (added)
+++ incubator/stanbol/trunk/entityhub/indexing/dblp/src/main/java/org/apache/stanbol/entityhub/indexing/dblp/cli/CommandLineRunner.java Thu Feb 17 12:00:19 2011
@@ -0,0 +1,325 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.entityhub.indexing.dblp.cli;
+
+import static org.apache.stanbol.entityhub.indexing.rdf.RdfIndexer.KEY_CHUNK_SIZE;
+import static org.apache.stanbol.entityhub.indexing.rdf.RdfIndexer.KEY_ENTITY_RANKINGS;
+import static org.apache.stanbol.entityhub.indexing.rdf.RdfIndexer.KEY_FIELD_MAPPINGS;
+import static org.apache.stanbol.entityhub.indexing.rdf.RdfIndexer.KEY_INDEXING_MODE;
+import static org.apache.stanbol.entityhub.indexing.rdf.RdfIndexer.KEY_MODEL_NAME;
+import static org.apache.stanbol.entityhub.indexing.rdf.RdfIndexer.KEY_RDF_FILES;
+import static org.apache.stanbol.entityhub.indexing.rdf.RdfIndexer.KEY_RDF_STORE_DIR;
+import static org.apache.stanbol.entityhub.indexing.rdf.RdfIndexer.KEY_REQUIRED_ENTITY_RANKING;
+import static org.apache.stanbol.entityhub.indexing.rdf.RdfIndexer.KEY_RESUME_MODE;
+import static org.apache.stanbol.entityhub.indexing.rdf.RdfIndexer.KEY_SKIP_READ;
+import static org.apache.stanbol.entityhub.indexing.rdf.RdfIndexer.KEY_YARD;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.HashMap;
+import java.util.Hashtable;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.TreeMap;
+
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.PosixParser;
+import org.apache.commons.io.IOUtils;
+import org.apache.stanbol.entityhub.indexing.rdf.RdfIndexer;
+import org.apache.stanbol.entityhub.indexing.rdf.RdfIndexer.IndexingMode;
+import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
+import org.apache.stanbol.entityhub.servicesapi.model.rdf.RdfResourceEnum;
+import org.apache.stanbol.entityhub.servicesapi.yard.YardException;
+import org.apache.stanbol.entityhub.yard.solr.impl.SolrYard;
+import org.apache.stanbol.entityhub.yard.solr.impl.SolrYardConfig;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+public class CommandLineRunner {
+    public static final String[] defaultFieldMappings = new String [] {
+        // --- Define the Languages for all fields ---
+        //NOTE: the leading space is required for the global filter!
+        
+        // --- RDF, RDFS and OWL Mappings ---
+        "rdfs:label", //rdfs:label
+        "rdfs:comment",//rdfs:comment
+        "rdfs:seeAlso | d=entityhub:ref",
+        "rdf:type | d=entityhub:ref",//The types
+        "owl:sameAs | d=entityhub:ref",//used by LOD to link to URIs used to identify the same Entity
+        // --- Dublin Core ---
+        "dc:*", //all DC Terms properties
+        "dc-elements:*", //to keep the "old" dc-element triples 
+        "dc-elements:contributor > dc:contributor",
+        "dc-elements:coverage > dc:coverage",
+        "dc-elements:creator > dc:creator",
+        "dc-elements:date > dc:date",
+        "dc-elements:description > dc:description",
+        "dc-elements:format > dc:format",
+        "dc-elements:identifier > dc:identifier",
+        "dc-elements:language > dc:language",
+        "dc-elements:publisher > dc:publisher",
+        "dc-elements:relation > dc:relation",
+        "dc-elements:rights > dc:rights",
+        "dc-elements:source > dc:source",
+        "dc-elements:subject > dc:subject",
+        "dc-elements:title > dc:title",
+        "dc-elements:type > dc:type",
+        // --- Social Networks (via foaf) ---
+        "foaf:*", //The Friend of a Friend schema often used to describe social relations between people
+        "foaf:knows | d=entityhub:ref",
+        "foaf:made | d=entityhub:ref",
+        "foaf:maker | d=entityhub:ref",
+        "foaf:member | d=entityhub:ref",
+        "foaf:homepage | d=xsd:anyURI",
+        "foaf:depiction | d=xsd:anyURI",
+        "foaf:img | d=xsd:anyURI",
+        "foaf:logo | d=xsd:anyURI",
+        "foaf:page | d=xsd:anyURI", //page about the entity
+        // --- The SWRC (Semantic Web for Research Communities) Ontology
+        "swrc:*" //select all
+    };
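+    /*
+     * Informal reading of the mapping syntax as used above (derived from
+     * the entries themselves, not a normative description): "prefix:prop"
+     * indexes a property as-is, "prefix:*" selects all properties of a
+     * namespace, "a > b" copies values of property a to property b, and
+     * "| d=..." converts the values to the given datatype (e.g.
+     * entityhub:ref for entity references, xsd:anyURI for links).
+     */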
+    public static final Map<String,Float> fieldBoosts;
+    static {
+        Map<String,Float> boosts = new HashMap<String, Float>();
+        boosts.put(NamespaceEnum.rdfs+"label", 3f);
+        boosts.put(NamespaceEnum.dcTerms+"title", 3f);
+        boosts.put(NamespaceEnum.dcElements+"title", 3f);
+        boosts.put(NamespaceEnum.foaf+"name", 3f);
+        fieldBoosts = Collections.unmodifiableMap(boosts);
+    }
+
+    protected static final Logger log = LoggerFactory.getLogger(CommandLineRunner.class);
+
+    private static final String header;
+    static {
+        StringBuilder builder = new StringBuilder();
+        builder.append("Description:\nThis Utility creates a full Yard for DBLP Computer Science Bibliography by using the SolrYard implementation.\n");
+        builder.append("\nParameter:\n");
+        builder.append(" - \"-Xmx\": This implementation does not need much memory. RDF data are loaded into the file based Jena TDB store. Indexing is done in chunks of 1000 (default). In case of OutOfMemory errors you need to increase this value!");
+        builder.append(" - solrServerUri : The URL of the Solr Server used to index the data. Make sure to use the schema.xml as needed by the SolrYard!\n");
+        builder.append(" - RDF dump: The relative or absolute path to the Rdf Dump or the Dir containing the source RDF files to be used for indexing. You can direcly use the compressed archives. In case you point to an Directory, all files in that directory are used to create the index\n");
+        builder.append("\nOptions:\n");
+        header = builder.toString();
+        builder = null;
+    }
+    private static final Options options;
+    static {
+        options = new Options();
+        options.addOption("h", "help", false, "display this help and exit");
+        options.addOption("d", "debug", false, "show debug stacktrace upon error");
+        //options.addOption("yt","yardtype",false, "the type of the yard used as target 'solr' or 'rdf' (default:'solr')");
+        //options.addOption("i","index",true, "Base URI of the used Solr Server used to index the data");
+        options.addOption("n", "name", true, "the id and name used for the Yard (default: 'dblp')");
+        options.addOption("m","mappings",true, "file with the fieldMappings used for indexing (this will replace the defaults)");
+        options.addOption("c","chunksize",true, "the number of documents stored in one chunk (default: 1000");
+        options.addOption("s","skipRdf",false, "this options allow to skip the loading of the RDF Data (e.g. if alredy loaded to the Triple Store)");
+        options.addOption("i","incomming",true,"the file with the incomming links for Entities (id tab num, highest num needs to be the first line!)");
+        options.addOption("ri","requiredIncomming",true,"the minimum number of incomming lins for Entities to be indexed");
+        options.addOption("r","resume",true,"resume a previous canceled indexing session (usually used with -s)");
+    }
+    private static final String footer;
+    static {
+        StringBuilder builder = new StringBuilder();
+        builder.append("Default Field Mappings:\n");
+        for(String mapping: defaultFieldMappings){
+            builder.append(String.format("\t%s\n", mapping));
+        }
+        footer = builder.toString();
+        builder = null;
+    }
+
+    private static float minRequiredRanking;
+
+    public static void main(String[] args) throws IOException, ParseException, YardException {
+        CommandLineParser parser = new PosixParser();
+        CommandLine line = parser.parse(options, args);
+
+        if (line.getArgs().length < 2 || line.hasOption("h")) {
+            HelpFormatter formatter = new HelpFormatter();
+            formatter.printHelp(
+                    "java -Xmx1024M -jar org.apache.stanbol.indexing.dbPedia-0.1-SNAPSHOT-jar-with-dependencies.jar [options] solrServerUri rdfDump",
+                    header,
+                    options,
+                    footer);
+            System.exit(0);
+        }
+        String yardName = line.getOptionValue("n");
+        if(yardName == null){
+            yardName = "dblp";
+        }
+        Dictionary<String, Object> indexingConfig = new Hashtable<String, Object>();
+        //first the SolrServer used to store the index
+        SolrYardConfig yardConfig = new SolrYardConfig(yardName, line.getArgs()[0]);
+        //use the signRank as default for document Boosts
+        yardConfig.setDocumentBoostFieldName(RdfResourceEnum.signRank.getUri());
+        //increase the boost for fields that are usually used as labels
+        yardConfig.setFieldBoosts(fieldBoosts);
+        SolrYard yard = new SolrYard(yardConfig);
+        indexingConfig.put(KEY_YARD, yard);
+        //now the other properties
+        File dataFileOrDir = new File(line.getArgs()[1]);
+        if(!dataFileOrDir.exists()){
+            log.error("Parsed data file or directory "+dataFileOrDir+" does not exist on the file system");
+            System.exit(1);
+        }
+        //also allow parsing a single file!
+//        if(!dataDir.isDirectory()){
+//            log.error("Parsed Data Directory "+dataDir+" exists, but is not a Directory!");
+//            System.exit(0);
+//        }
+        if(!dataFileOrDir.canRead()){
+            log.error("Unable to read data file or directory "+dataFileOrDir+"!");
+            System.exit(1);
+        }
+        File[] files;
+        if(dataFileOrDir.isDirectory()){
+            files = dataFileOrDir.listFiles();
+        } else { //isFile()
+            files = new File[] {dataFileOrDir};
+        }
+        indexingConfig.put(KEY_RDF_FILES, Arrays.asList(files));
+        indexingConfig.put(KEY_RDF_STORE_DIR, yardName+"-rdf-data");
+        Integer chunkSize;
+        try {
+            chunkSize = Integer.valueOf(line.getOptionValue("c", "1000"));
+        } catch (NumberFormatException e) {
+            throw new IllegalArgumentException("Value for option \"chunksize\" needs to be a valid Integer");
+        }
+        if(chunkSize < 0){
+            log.warn("Negative number parsed for option \"chunksize\". Using '1000' as the default.");
+            chunkSize = 1000;
+        }
+        if(!line.hasOption("m")){
+            indexingConfig.put(KEY_FIELD_MAPPINGS, Arrays.asList(defaultFieldMappings));
+        } else {
+            File fieldMappingFile = new File(line.getOptionValue("m"));
+            if(fieldMappingFile.exists() && fieldMappingFile.isFile() && fieldMappingFile.canRead()){
+                String[] fieldMappings = IOUtils.toString(new FileInputStream(fieldMappingFile)).split("\n");
+                indexingConfig.put(KEY_FIELD_MAPPINGS, Arrays.asList(fieldMappings));
+            } else {
+                log.error(String.format("Invalid fieldMapping File (exist: %s | isFile: %s | canRead: %s)",
+                        fieldMappingFile.exists(),fieldMappingFile.isFile(),fieldMappingFile.canRead()));
+                System.exit(0);
+            }
+
+        }
+        if(line.hasOption("s")){
+            indexingConfig.put(KEY_SKIP_READ, Boolean.TRUE);
+        }
+        indexingConfig.put(KEY_CHUNK_SIZE, chunkSize);
+        indexingConfig.put(KEY_MODEL_NAME, "indexingModel-49e176b9-0138-dd4c-2b87-89af85b89a57");
+        //entityRank related properties
+        if(line.hasOption("i")){
+            File tsvScoreFile = new File(line.getOptionValue("i"));
+            if(tsvScoreFile.exists() && tsvScoreFile.isFile() && tsvScoreFile.canRead()){
+                int minIncommings = -1;
+                try {
+                    minIncommings = Integer.parseInt(line.getOptionValue("ri", "-1"));
+                } catch (Exception e) {
+                    log.error("Value of option --minIncomming/-mi MUST BE a valid integer");
+                    System.exit(0);
+                }
+                Map<String,Float> entityRankings = clacEntityRanks(tsvScoreFile,minIncommings);
+                indexingConfig.put(KEY_ENTITY_RANKINGS, entityRankings);
+                log.info(String.format(" ... set min required score to %s (represents %s incomming links",minRequiredRanking,minIncommings));
+                indexingConfig.put(KEY_REQUIRED_ENTITY_RANKING, minRequiredRanking);
+            } else {
+                log.error(String.format("Parsed File with the incommung links is invalid (esists:%s,isFile:%s,canRead:%s)",
+                        tsvScoreFile.exists(), tsvScoreFile.isFile(), tsvScoreFile.canRead()));
+            }
+        } else {
+            if(line.hasOption("ri")){
+                log.warn("Option --requiredIncomming/-ri is only supported of Option --incomming/-i is active!");
+            }
+        }
+        //This mode uses the ids of the entity ranking map as the main lookup for
+        //entities to index. This is faster than the usual mode if less than
+        //50% of the entities are indexed!
+        if(line.hasOption("r")){
+            //resume only really makes sense with the RANKING BASED MODE
+            indexingConfig.put(KEY_INDEXING_MODE, IndexingMode.RANKING_MAP_BASED);
+            //set the RESUME MODE
+            indexingConfig.put(KEY_RESUME_MODE, Boolean.TRUE);
+        }
+        RdfIndexer indexer = new RdfIndexer(indexingConfig);
+        indexer.index();
+    }
+    @SuppressWarnings("unchecked")
+    private static Map<String,Float> clacEntityRanks(File tsvScoreFile,int minIncommings) throws IOException {
+        TreeMap<String,Float> entityRankings = new TreeMap<String, Float>();
+        final Iterator<String> lines = IOUtils.lineIterator(
+                new FileInputStream(tsvScoreFile), "utf-8");
+        long lineNumber = 0;
+        int maxIncoming = 0;
+        float maxScore = 0;
+        long filtered  = 0;
+        log.info("  ... init Entity Ranks based on "+tsvScoreFile);
+        while (lines.hasNext()) {
+            String line = lines.next();
+            String[] parts = line.split("\t");
+            if (parts.length != 2) {
+                log.warn(String.format("skipping line: '%s'", line));
+                continue; //otherwise parts[1] below throws an ArrayIndexOutOfBoundsException
+            }
+            int incoming = Integer.parseInt(parts[1].trim());
+            // take the log to avoid overly popular entities dominating
+            // the results (attenuates the Zipf law of culturally
+            // generated distributions)
+            float score = (float)Math.log1p(incoming);
+            if (lineNumber == 0 && score > 0) {
+                maxIncoming = incoming;
+                maxScore = score;
+                if(minIncommings <= 0){
+                    minRequiredRanking = -1f; //deactivate
+                } else {
+                    float min = (float)Math.log1p(minIncoming);
+                    minRequiredRanking = min/maxScore;
+                    if(minRequiredRanking > 1){
+                        log.error("Parsed minimum of required incoming links is bigger than the highest number of incoming links for any entity!");
+                        System.exit(1);
+                    }
+                }
+            }
+            score = score/maxScore;
+            if(score > 1){
+                log.error("Found Entity wiht more incomming links than the entity in the first line");
+                log.error("current:"+line);
+                System.exit(0);
+            }
+            if(score >= minRequiredRanking){
+                entityRankings.put(parts[0], score);
+            } else {
+                filtered ++;
+            }
+            lineNumber++;
+        }
+        log.info(String.format("  ... processed %s entities (%s with ranking > required | %s filtered",
+                lineNumber,lineNumber-filtered,filtered));
+        return entityRankings;
+    }
+
+}

Propchange: incubator/stanbol/trunk/entityhub/indexing/dblp/src/main/java/org/apache/stanbol/entityhub/indexing/dblp/cli/CommandLineRunner.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain
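
A side note on the ranking heuristic implemented by calcEntityRanks(..) above: the
score of an entity is the log1p of its incoming-link count, normalized against the
score of the first (highest ranked) line of the TSV file. A minimal, self-contained
sketch with made-up counts (all numbers below are hypothetical, not taken from DBLP):

    public class RankingExample {
        public static void main(String[] args) {
            // hypothetical incoming-link counts, sorted descending as the TSV requires
            int[] incoming = {1000, 100, 5};
            // the first line defines the maximum score (log1p(1000) ~ 6.909)
            double maxScore = Math.log1p(incoming[0]);
            for (int links : incoming) {
                // normalized rank in (0..1], as computed by calcEntityRanks(..)
                System.out.printf("%d links -> rank %.3f%n", links, Math.log1p(links) / maxScore);
            }
            // with "-ri 100" the required minimum becomes log1p(100)/log1p(1000) ~ 0.668,
            // so the entity with only 5 links (rank ~0.259) would be filtered out
        }
    }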

Modified: incubator/stanbol/trunk/entityhub/indexing/genericRdf/src/main/java/org/apache/stanbol/entityhub/indexing/rdf/RdfIndexer.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/genericRdf/src/main/java/org/apache/stanbol/entityhub/indexing/rdf/RdfIndexer.java?rev=1071590&r1=1071589&r2=1071590&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/indexing/genericRdf/src/main/java/org/apache/stanbol/entityhub/indexing/rdf/RdfIndexer.java (original)
+++ incubator/stanbol/trunk/entityhub/indexing/genericRdf/src/main/java/org/apache/stanbol/entityhub/indexing/rdf/RdfIndexer.java Thu Feb 17 12:00:19 2011
@@ -535,7 +535,8 @@ public class RdfIndexer {
                     }
                     if(source != null){
                         if(processRanking(source)){
-                            if(resumeMode && yard.isRepresentation(source.getId())){ //resume mode check
+                            //add if !resumeMode or the representation is not yet in the yard
+                            if(!resumeMode || !yard.isRepresentation(source.getId())){
                                 //log.info("S<source Resource:\n"+ModelUtils.getRepresentationInfo(source));
                                 indexed++;
                                 indexedStdCount=indexedStdCount+repStdCount;
@@ -725,7 +726,12 @@ public class RdfIndexer {
      * @param is
      * @param name
      */
-    private void importRdfData(InputStream is, String name) {
+    private void importRdfData(InputStream is, String parsedName) {
+        String name = parsedName;
+        if(name.startsWith(".")){
+            log.info(" > Ignore hidden file "+parsedName+"!");
+            return;
+        }
         if (name.endsWith(".gz")) {
             try {
                 is = new GZIPInputStream(is);
@@ -756,7 +762,9 @@ public class RdfIndexer {
 //            format = Lang.RDFXML;
 //        }
         //For N-Triple we can use the TDBLoader
-        if(format == Lang.NTRIPLES){
+        if(format == null){
+            log.warn(" > ignore File with unknown extension "+parsedName);
+        } else if(format == Lang.NTRIPLES){
             TDBLoader.load(indexingDataset, is,true);
         } else if(format != Lang.RDFXML){
             //use RIOT to parse the format but with a special configuration
@@ -842,7 +850,7 @@ public class RdfIndexer {
      * last possible opportunity :(
      */
     private void writeCacheBaseConfiguration() throws YardException {
-        log.info("Write BaseMappings for geonames.org Cache");
+        log.info("Write BaseMappings for Cache");
         if(mapper != null){
             CacheUtils.storeBaseMappingsConfiguration(yard, mapper);
         }
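
Note on the first hunk above: before this change an entity was only counted as indexed
when resume mode was active AND its representation was already stored, i.e. exactly the
entities a resumed run should skip. A minimal sketch of the corrected predicate
(shouldIndex is a hypothetical helper name; the two flags are the ones used in the patch):

    class ResumeCheck {
        /** Restates the corrected condition from the hunk above. */
        static boolean shouldIndex(boolean resumeMode, boolean alreadyInYard) {
            // resumeMode=false                      -> always index
            // resumeMode=true,  alreadyInYard=true  -> skip (indexed before the cancel)
            // resumeMode=true,  alreadyInYard=false -> index
            return !resumeMode || !alreadyInYard;
        }
    }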

Modified: incubator/stanbol/trunk/entityhub/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/pom.xml?rev=1071590&r1=1071589&r2=1071590&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/pom.xml (original)
+++ incubator/stanbol/trunk/entityhub/pom.xml Thu Feb 17 12:00:19 2011
@@ -65,5 +65,7 @@
     <!-- Utils for createing local caches (indexing utils) -->
     <module>indexing/geonames</module>
     <module>indexing/genericRdf</module>
+    <module>indexing/dbPedia</module>
+    <module>indexing/dblp</module>
   </modules>
 </project>