You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by sf...@apache.org on 2010/12/12 16:13:37 UTC
svn commit: r1044832 [2/14] - in /incubator/stanbol/trunk/rick:
indexing/dbPedia/src/main/java/eu/iksproject/rick/indexing/dbPedia/cli/
indexing/genericRdf/src/main/java/eu/iksproject/rick/indexing/rdf/
indexing/geonames/src/main/java/eu/iksproject/ric...
Modified: incubator/stanbol/trunk/rick/indexing/genericRdf/src/main/java/eu/iksproject/rick/indexing/rdf/RdfIndexer.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/rick/indexing/genericRdf/src/main/java/eu/iksproject/rick/indexing/rdf/RdfIndexer.java?rev=1044832&r1=1044831&r2=1044832&view=diff
==============================================================================
--- incubator/stanbol/trunk/rick/indexing/genericRdf/src/main/java/eu/iksproject/rick/indexing/rdf/RdfIndexer.java (original)
+++ incubator/stanbol/trunk/rick/indexing/genericRdf/src/main/java/eu/iksproject/rick/indexing/rdf/RdfIndexer.java Sun Dec 12 15:13:35 2010
@@ -68,7 +68,7 @@ import eu.iksproject.rick.servicesapi.ya
/**
* This Class indexes Entities based on Information provided in a RDF Graph
- *
+ *
* Features (currently Brainstorming)<ul>
* <li> Parse also Archive Files (nobody likes to extract such stuff)
* <li> Parse different RDF formats (esay with Clerezza)
@@ -78,7 +78,7 @@ import eu.iksproject.rick.servicesapi.ya
* working with dumps that split up data not by entity but by properties
* <li> Support for the RICK Representation Mapping Infrastructure (currently
* this means using the {@link FieldMapper}
- * <li> Support for filtering Entities based on rdf:type (will be in future
+ * <li> Support for filtering Entities based on rdf:type (will be in future
* version supported by the RICK Representation Mapping Infrastructure)
* <li> Entity Rank support: It is not feasible to calculate the rankings in a
* generic fashion. Therefore this implementation supports two different
@@ -99,7 +99,7 @@ import eu.iksproject.rick.servicesapi.ya
* <li> I am a bit worried with the ParsingProvide} because the interface
* allows no streaming. One needs to check if this causes problems for
* very big RDF files.<br>
- * After looking into the source code of Clerezza, I am even more worried!
+ * After looking into the source code of Clerezza, I am even more worried!
* The Parser creates an SimpleMGraph and wrapped it with an
* Jena Adapter. Than all Triples are parsed into memory. So I do not only
* have the triples in memory but also the instances of the jena adapter.
@@ -112,379 +112,379 @@ import eu.iksproject.rick.servicesapi.ya
* Parsing and especially SPARQL queries would be better to do directly
* via the Jena API!<p>
* This should be no problem, because indexing is a read only operation!
- * <li> However based on this findings I plan now to implement the loading of the
+ * <li> However based on this findings I plan now to implement the loading of the
* RDF data to directly use the Jena TDB API because Clerezza seams not to
- * be designed to handle the loading of RDF datasets that can not be
+ * be designed to handle the loading of RDF datasets that can not be
* kept in memory.<br>
* </ul>
- *
+ *
* @author Rupert Westenthaler
* @author ogrisel (as parts of that code is taken from iks-autotagger)
*
*/
public class RdfIndexer {
- /**
- * The indexing mode defines if the RDF data are appended to existing
- * {@link Representation}s in the target {@link Yard} or if {@link Representation}
- * are replaced with RDF data used for indexing!
- * @author Rupert Westenthaler
- *
- */
- public static enum IndexingMode{ REPLACE, APPEND }
-
- public static final String RDF_XML = "application/rdf+xml";
- public static final String TURTLE = "text/turtle";
- public static final String X_TURTLE = "application/x-turtle";
- public static final String N_TRIPLE = "text/rdf+nt";
- public static final String N3 = "text/rdf+n3";
- public static final String RDF_JSON = "application/rdf+json";
- //both html and xhtml can be rdf formats with RDFa
- public static final String XHTML = "application/xhtml+xml";
- public static final String HTML = "text/html";
-
+ /**
+ * The indexing mode defines if the RDF data are appended to existing
+ * {@link Representation}s in the target {@link Yard} or if {@link Representation}
+ * are replaced with RDF data used for indexing!
+ * @author Rupert Westenthaler
+ *
+ */
+ public static enum IndexingMode{ REPLACE, APPEND }
+
+ public static final String RDF_XML = "application/rdf+xml";
+ public static final String TURTLE = "text/turtle";
+ public static final String X_TURTLE = "application/x-turtle";
+ public static final String N_TRIPLE = "text/rdf+nt";
+ public static final String N3 = "text/rdf+n3";
+ public static final String RDF_JSON = "application/rdf+json";
+ //both html and xhtml can be rdf formats with RDFa
+ public static final String XHTML = "application/xhtml+xml";
+ public static final String HTML = "text/html";
+
protected static final String resourceQuery;
- static {
- StringBuilder cqb = new StringBuilder();
- cqb.append("SELECT ?field ?value ");//, count(distinct ?incoming) AS ?count ");
- cqb.append("{ ");
- cqb.append(" <%s> ?field ?value .");
-// cqb.append(" OPTIONAL {?incoming ?relationship <%s> . } .");
- cqb.append("} ");
- resourceQuery = cqb.toString();
- }
-
- Logger log = LoggerFactory.getLogger(RdfIndexer.class);
- /**
- * Key used to parse the Yard used for indexing
- */
- public static final String KEY_YARD = "eu.iksproject.rick.indexing.yard";
- /**
- * Key used to parse reference(s) to the RDF files to be indexed!<p>
- * This supports both single values as well as {@link Iterable}s over several
- * values. All parsed sources are loaded within one TripleStore and are
- * indexed at once! Use several {@link RdfIndexer} instances to index them
- * one after the other.
- */
- public static final String KEY_RDF_FILES = "eu.iksproject.rick.indexing.rdf.rdfFiles";
- /**
- * Key used to configure the fieldMappings used to determine what properties
- * are indexed for entities. Values must implement {@link Iterable} and the
- * {@link Object#toString()} is used to parse the different mappings.
- */
- public static final String KEY_FIELD_MAPPINGS = "eu.iksproject.rick.indexing.rdf.fieldMappings";
-
- /**
- * Key used to configure the directory to store RDF data needed during the
- * indexing process. This data might be reused when resuming an indexing
- * process.
- */
- public static final String KEY_RDF_STORE_DIR = "eu.iksproject.rick.indexing.rdf.indexingDir";
- /**
- * Key used to configure the name of the model used to store the parsed
- * RDF data before the indexing process. Parsing this name can be used to
- * resume indexing based on previously parsed RDF data.
- */
- public static final String KEY_MODEL_NAME = "eu.iksproject.rick.indexing.rdf.modelName";
- /**
- * Key used to parse the Iterable over all the rdf:types to be indexed. If
- * not parsed or set to <code>null</code> or an empty list, than all Resources are
- * accepted!
- * The {@link Object#toString()} method is used on elements to get the actual type!
- */
- public static final String KEY_RDF_TYPES = "eu.iksproject.rick.indexing.rdf.indexedTypes";
- /**
- * Key used to parse the indexing mode. Values should be of instance {@link IndexingMode}
- * or the {@link Object#toString()} value should be a member of this enum!
- */
- public static final String KEY_INDEXING_MODE = "eu.iksproject.rick.indexing.rdf.indexingMode";
- /**
- * If <code>true</code> than no RDF data are loaded. Instead it is assumed, that
- * the Graph of the parsed {@link #KEY_MODEL_NAME} already contains all the needed
- * data!<p>
- * This can be useful if one first wants to index rdf:type A and than rdf:type B
- * based on the same set of data
- */
- public static final String KEY_SKIP_READ = "eu.iksproject.rick.indexing.rdf.skipRead";
- /**
- * The number of {@link Representation}s stored at once in the SolrYard!
- */
- public static final String KEY_CHUNK_SIZE = "eu.iksproject.rick.indexing.rdf.chunkSize";
- /**
- * Can be used to parse a map with {@link String} entity id, {@link Float} rank
- * for entities.<p>
- * Such values are added to Representations for the {@link RdfResourceEnum#signRank}
- * field.
- */
- public static final String KEY_ENTITY_RANKINGS = "eu.iksproject.rick.indexing.rdf.entityRankings";
- /**
- * Can be used to activate ignoring of Entities without a page rank
- */
- public static final String KEY_IGNORE_ENTITIES_WITHOUT_ENTITY_RANKING = "eu.iksproject.rick.indexing.rdf.ignoreEntitiesWithoutRankings";
- /**
- * If set to a value >= 0 this is used to exclude Entities with a lower or
- * missing entity rank
- */
- public static final String KEY_REQUIRED_ENTITY_RANKING = "eu.iksproject.rick.indexing.rdf.requiredRanking";
- /**
- * The rank for entities with a missing rank. This takes only effect if
- * {@link #KEY_IGNORE_ENTITIES_WITHOUT_ENTITY_RANKING} is set to <code>false</code>
- * (the default)
- */
- public static final String KEY_DEFAULT_ENTITY_RANKING = "eu.iksproject.rick.indexing.rdf.defaultRanking";
- /**
- * Expert only: This allows to enable indexing based on the keys in the map parsed with the
- * entity rankings. This will only index entities that are keys in that map.
- * If no Map is parsed by {@link #KEY_ENTITY_RANKINGS}, than activating this mode
- * will not be successful and a warning will be written.<p>
- * This mode is about 50% slower than the usual indexing mode. Therefore this
- * mode makes only sense id less than 50% of the entities are indexed.
- */
- public static final String KEY_ENTITY_RANKING_BASED_INDEXING_MODE = "eu.iksproject.rick.indexing.rdf.rankingBasedIndexingMode";
- /**
- * The resume Mode first checks if a Resource is already present in the parsed
- * Yard. If this is the case, than the representation is not indexes again.<p>
- * This mode is intended to resume indexing after stopping a previous call before
- * finished. The default value = false.
- */
- public static final String KEY_RESUME_MODE = "eu.iksproject.rick.indexing.rdf.resumeMode";
-
- private final IndexingMode indexingMode;
- private final Yard yard;
- private final ValueFactory vf;
- private final List<File> rdfFiles;
- private final File indexingDir;
- private final String modelName;
-// private final ParsingProvider parser = new JenaParserProvider();
- //private final WeightedTcProvider provider;
- private final FieldMapper mapper;
- private final Set<String> types;
- //private MGraph indexingGraph;
- private final DatasetGraphTDB indexingDataset;
- private final boolean skipRead;
- private Location modelLocation;
- private int indexingChunkSize = 1000;
-
- //vars for entity rankings
- private Map<String,Float> entityRankings = null;
- private boolean ignoreEntitiesWithoutRank = false;
- private float defaultEntityRanking = -1;
- private String entityRankingField = RdfResourceEnum.signRank.getUri();
- private float minimumRequiredEntityRanking = -1;
- private boolean rankingMode;
- private boolean resumeMode;
-
-
- public RdfIndexer(Dictionary<String, Object> config){
- this.yard = (Yard)config.get(KEY_YARD);
- if(yard == null){
- throw new IllegalArgumentException("Parsed config MUST CONTAIN a Yard. Use the key "+KEY_YARD+" to parse the YardInstance used to store the geonames.org index!");
- } else {
- log.info(String.format("Using Yard %s (id=%s) to index parsed RDF data",
- yard.getName(),yard.getId()));
- }
- this.vf = yard.getValueFactory();
- Object rdfFiles = config.get(KEY_RDF_FILES);
- if(rdfFiles instanceof Iterable<?>){
- this.rdfFiles = new ArrayList<File>();
- for(Object value : (Iterable<?>)rdfFiles){
- this.rdfFiles.add(checkFile(value.toString()));
- }
- } else {
- this.rdfFiles = Collections.singletonList(checkFile(rdfFiles.toString()));
- }
- Object indexingDir = config.get(KEY_RDF_STORE_DIR);
- if(indexingDir == null){
- indexingDir = "indexingData";
- config.put(KEY_RDF_STORE_DIR, indexingDir);
- }
- this.indexingDir = checkFile(indexingDir.toString(), false, true);
- Object modelName = config.get(KEY_MODEL_NAME);
- if(modelName == null){
- modelName = "indexingModel-"+ModelUtils.randomUUID().toString();
- config.put(KEY_MODEL_NAME, modelName);
- }
- this.modelName = modelName.toString();
- //init the types!
- Iterable<?> types = (Iterable<?>)config.get(KEY_RDF_TYPES);
- if(types != null){
- Set<String> typeSet = new HashSet<String>();
- for(Object type : types){
- if(type != null){
- typeSet.add(type.toString());
- log.info(" - adding Resoures with rdf:type "+type);
- }
- }
- if(typeSet.isEmpty()){
- log.info(" - adding all Types (no rdf:type based restriction for RDF Reseource present)");
- this.types = null;
- } else {
- this.types = typeSet;
- }
- } else{
- log.info(" - adding all Types (no rdf:type based restriction for RDF Reseource present)");
- this.types = null; //null or an iterable with one or more elements!
- }
- //init the indexing mode
- Object indexingMode = config.get(KEY_INDEXING_MODE);
- if(indexingMode == null){
- this.indexingMode = IndexingMode.REPLACE; //default to replace
- } else if(indexingMode instanceof IndexingMode){
- this.indexingMode = (IndexingMode)indexingMode;
- } else {
- try {
- this.indexingMode = IndexingMode.valueOf(indexingMode.toString());
- }catch (IllegalArgumentException e) {
- //catch and re-throw with a better message!
- throw new IllegalArgumentException(
- String.format("Values of KEY \"%s\" MUST BE of Type %s or the toString() value MUST BE a member of this Enumeration. If the Key is missing %s is used!",
- KEY_INDEXING_MODE,IndexingMode.class,IndexingMode.REPLACE),e);
- }
- }
- //init the fieldMapper
- Iterable<?> mappings = (Iterable<?>)config.get(KEY_FIELD_MAPPINGS);
- List<FieldMapping> fieldMappings;
- if(mappings != null){
- fieldMappings = new ArrayList<FieldMapping>();
- for(Object mappingString : mappings){
- if(mappingString != null){
- FieldMapping fieldMapping = FieldMappingUtils.parseFieldMapping(mappingString.toString());
- if(fieldMapping != null){
- fieldMappings.add(fieldMapping);
- }
- }
- }
- if(!fieldMappings.isEmpty()){
- this.mapper = new DefaultFieldMapperImpl(ValueConverterFactory.getInstance(vf));
- for(FieldMapping mapping : fieldMappings){
- mapper.addMapping(mapping);
- }
- //we need to add a mapping for the field rankings (if a mapper is present)
- mapper.addMapping(new FieldMapping(this.entityRankingField));
- } else {
- this.mapper = null;
- }
- } else {
- this.mapper = null;
- }
- File modelDir = new File(this.indexingDir,this.modelName);
- if(!modelDir.exists()){
- modelDir.mkdir();
- } else if(!modelDir.isDirectory()){
- throw new IllegalStateException(String.format("A directory for %s already exists but is not a directory!",modelDir.getAbsoluteFile()));
- } //else exists and is a dir -> nothing to do
- Object skipRead = config.get(KEY_SKIP_READ);
- if(skipRead != null){
- if(skipRead instanceof Boolean){
- this.skipRead = ((Boolean)skipRead).booleanValue();
- } else {
- this.skipRead = Boolean.parseBoolean(skipRead.toString());
- }
- } else {
- this.skipRead = false;
- }
- Integer chunkSize = (Integer)config.get(KEY_CHUNK_SIZE);
- if(chunkSize != null && chunkSize>0){
- this.indexingChunkSize = chunkSize;
- } //else use default value of 1000
-
- this.modelLocation = new Location(modelDir.getAbsolutePath());
- this.indexingDataset = TDBFactory.createDatasetGraph(modelLocation) ;
- //this.provider = new IndexingModelProvider(this.indexingDir);
-
- //init entity Ranking
- try{
- this.entityRankings = (Map<String,Float>)config.get(KEY_ENTITY_RANKINGS);
- }catch (RuntimeException e) {
- log.error("Parsed Entity Rankings MUST use the form Map<String,Float>");
- System.exit(0);
- }
- Object ignore = config.get(KEY_IGNORE_ENTITIES_WITHOUT_ENTITY_RANKING);
- if(ignore != null){
- if(ignore instanceof Boolean){
- this.ignoreEntitiesWithoutRank = (Boolean)ignore;
- } else {
- this.ignoreEntitiesWithoutRank = Boolean.parseBoolean(ignore.toString());
- }
- }
- Object defaultRankingObject = config.get(KEY_DEFAULT_ENTITY_RANKING);
- if(defaultRankingObject != null){
- float defaultranking = -1;
- if(defaultRankingObject instanceof Float){
- defaultranking = (Float)defaultRankingObject;
- } else {
- try {
- defaultranking = Float.parseFloat(defaultRankingObject.toString());
- } catch (Exception e) {
- log.error("Unable to parse Float value for the Default Entity Ranking from the value parsed for the KEY_DEFAULT_ENTITY_RANKING key (value: "+defaultRankingObject+")");
- System.exit(0);
- }
- }
- this.defaultEntityRanking = defaultranking;
- }
- Object minimumRequiredRankingObject = config.get(KEY_REQUIRED_ENTITY_RANKING);
- if(minimumRequiredRankingObject != null){
- float minRanking = -1;
- if(minimumRequiredRankingObject instanceof Float){
- minRanking = (Float)minimumRequiredRankingObject;
- } else {
- try {
- minRanking = Float.parseFloat(minimumRequiredRankingObject.toString());
- } catch (Exception e) {
- log.error("Unable to parse Float value for the Minimum Required Entity Ranking from the value parsed for the KEY_DEFAULT_ENTITY_RANKING key (value: "+minimumRequiredRankingObject+")");
- System.exit(0);
- }
- }
- if(minRanking>=0){ //setting a valid required ranking automatically
- //means that entities without a rank should be ignored!
- this.ignoreEntitiesWithoutRank = true;
- }
- this.minimumRequiredEntityRanking = minRanking;
- }
- Object rankingMode = config.get(KEY_ENTITY_RANKING_BASED_INDEXING_MODE);
- if(rankingMode != null){
- if(rankingMode instanceof Boolean){
- this.rankingMode = (Boolean)rankingMode;
- } else {
- this.rankingMode = Boolean.parseBoolean(rankingMode.toString());
- }
- }
- if(this.rankingMode && this.entityRankings == null){
- log.warn("The Entity Ranking based Indexing Mode can not be activated if no EntityRankings are parsed! -> deactivate Ranking Mode (intertes over all Resources in the RDF Data)");
- this.rankingMode = false;
- }
- Object resumeMode = config.get(KEY_RESUME_MODE);
- if(resumeMode != null) {
- if(resumeMode instanceof Boolean){
- this.resumeMode = (Boolean)resumeMode;
- } else {
- this.resumeMode = Boolean.parseBoolean(resumeMode.toString());
- }
- } else {
- this.resumeMode = false;
- }
- }
- public void index() throws YardException{
- log.info("initialize ...");
- if(!skipRead){
- loadRdfFiles();
- } else {
- log.info(" ... skiping loading of RDF data");
- }
- if(rankingMode){
- indexRanked();
- } else {
- indexResources();
- }
- writeCacheBaseConfiguration();
- }
- /**
- * This Method is used to process the RDF Data if all Resource can be indexed,
- * because it provides the best performance. Mainly because it reads everything
- * from a single stream and therefore gives the OS the best opportunities to
- * optimise file access.
- * @throws YardException
- */
- private void indexResources() throws YardException{
+ static {
+ StringBuilder cqb = new StringBuilder();
+ cqb.append("SELECT ?field ?value ");//, count(distinct ?incoming) AS ?count ");
+ cqb.append("{ ");
+ cqb.append(" <%s> ?field ?value .");
+// cqb.append(" OPTIONAL {?incoming ?relationship <%s> . } .");
+ cqb.append("} ");
+ resourceQuery = cqb.toString();
+ }
+
+ Logger log = LoggerFactory.getLogger(RdfIndexer.class);
+ /**
+ * Key used to parse the Yard used for indexing
+ */
+ public static final String KEY_YARD = "eu.iksproject.rick.indexing.yard";
+ /**
+ * Key used to parse reference(s) to the RDF files to be indexed!<p>
+ * This supports both single values as well as {@link Iterable}s over several
+ * values. All parsed sources are loaded within one TripleStore and are
+ * indexed at once! Use several {@link RdfIndexer} instances to index them
+ * one after the other.
+ */
+ public static final String KEY_RDF_FILES = "eu.iksproject.rick.indexing.rdf.rdfFiles";
+ /**
+ * Key used to configure the fieldMappings used to determine what properties
+ * are indexed for entities. Values must implement {@link Iterable} and the
+ * {@link Object#toString()} is used to parse the different mappings.
+ */
+ public static final String KEY_FIELD_MAPPINGS = "eu.iksproject.rick.indexing.rdf.fieldMappings";
+
+ /**
+ * Key used to configure the directory to store RDF data needed during the
+ * indexing process. This data might be reused when resuming an indexing
+ * process.
+ */
+ public static final String KEY_RDF_STORE_DIR = "eu.iksproject.rick.indexing.rdf.indexingDir";
+ /**
+ * Key used to configure the name of the model used to store the parsed
+ * RDF data before the indexing process. Parsing this name can be used to
+ * resume indexing based on previously parsed RDF data.
+ */
+ public static final String KEY_MODEL_NAME = "eu.iksproject.rick.indexing.rdf.modelName";
+ /**
+ * Key used to parse the Iterable over all the rdf:types to be indexed. If
+ * not parsed or set to <code>null</code> or an empty list, than all Resources are
+ * accepted!
+ * The {@link Object#toString()} method is used on elements to get the actual type!
+ */
+ public static final String KEY_RDF_TYPES = "eu.iksproject.rick.indexing.rdf.indexedTypes";
+ /**
+ * Key used to parse the indexing mode. Values should be of instance {@link IndexingMode}
+ * or the {@link Object#toString()} value should be a member of this enum!
+ */
+ public static final String KEY_INDEXING_MODE = "eu.iksproject.rick.indexing.rdf.indexingMode";
+ /**
+ * If <code>true</code> than no RDF data are loaded. Instead it is assumed, that
+ * the Graph of the parsed {@link #KEY_MODEL_NAME} already contains all the needed
+ * data!<p>
+ * This can be useful if one first wants to index rdf:type A and than rdf:type B
+ * based on the same set of data
+ */
+ public static final String KEY_SKIP_READ = "eu.iksproject.rick.indexing.rdf.skipRead";
+ /**
+ * The number of {@link Representation}s stored at once in the SolrYard!
+ */
+ public static final String KEY_CHUNK_SIZE = "eu.iksproject.rick.indexing.rdf.chunkSize";
+ /**
+ * Can be used to parse a map with {@link String} entity id, {@link Float} rank
+ * for entities.<p>
+ * Such values are added to Representations for the {@link RdfResourceEnum#signRank}
+ * field.
+ */
+ public static final String KEY_ENTITY_RANKINGS = "eu.iksproject.rick.indexing.rdf.entityRankings";
+ /**
+ * Can be used to activate ignoring of Entities without a page rank
+ */
+ public static final String KEY_IGNORE_ENTITIES_WITHOUT_ENTITY_RANKING = "eu.iksproject.rick.indexing.rdf.ignoreEntitiesWithoutRankings";
+ /**
+ * If set to a value >= 0 this is used to exclude Entities with a lower or
+ * missing entity rank
+ */
+ public static final String KEY_REQUIRED_ENTITY_RANKING = "eu.iksproject.rick.indexing.rdf.requiredRanking";
+ /**
+ * The rank for entities with a missing rank. This takes only effect if
+ * {@link #KEY_IGNORE_ENTITIES_WITHOUT_ENTITY_RANKING} is set to <code>false</code>
+ * (the default)
+ */
+ public static final String KEY_DEFAULT_ENTITY_RANKING = "eu.iksproject.rick.indexing.rdf.defaultRanking";
+ /**
+ * Expert only: This allows to enable indexing based on the keys in the map parsed with the
+ * entity rankings. This will only index entities that are keys in that map.
+ * If no Map is parsed by {@link #KEY_ENTITY_RANKINGS}, than activating this mode
+ * will not be successful and a warning will be written.<p>
+ * This mode is about 50% slower than the usual indexing mode. Therefore this
+ * mode makes only sense id less than 50% of the entities are indexed.
+ */
+ public static final String KEY_ENTITY_RANKING_BASED_INDEXING_MODE = "eu.iksproject.rick.indexing.rdf.rankingBasedIndexingMode";
+ /**
+ * The resume Mode first checks if a Resource is already present in the parsed
+ * Yard. If this is the case, than the representation is not indexes again.<p>
+ * This mode is intended to resume indexing after stopping a previous call before
+ * finished. The default value = false.
+ */
+ public static final String KEY_RESUME_MODE = "eu.iksproject.rick.indexing.rdf.resumeMode";
+
+ private final IndexingMode indexingMode;
+ private final Yard yard;
+ private final ValueFactory vf;
+ private final List<File> rdfFiles;
+ private final File indexingDir;
+ private final String modelName;
+// private final ParsingProvider parser = new JenaParserProvider();
+ //private final WeightedTcProvider provider;
+ private final FieldMapper mapper;
+ private final Set<String> types;
+ //private MGraph indexingGraph;
+ private final DatasetGraphTDB indexingDataset;
+ private final boolean skipRead;
+ private Location modelLocation;
+ private int indexingChunkSize = 1000;
+
+ //vars for entity rankings
+ private Map<String,Float> entityRankings = null;
+ private boolean ignoreEntitiesWithoutRank = false;
+ private float defaultEntityRanking = -1;
+ private String entityRankingField = RdfResourceEnum.signRank.getUri();
+ private float minimumRequiredEntityRanking = -1;
+ private boolean rankingMode;
+ private boolean resumeMode;
+
+
+ public RdfIndexer(Dictionary<String, Object> config){
+ this.yard = (Yard)config.get(KEY_YARD);
+ if(yard == null){
+ throw new IllegalArgumentException("Parsed config MUST CONTAIN a Yard. Use the key "+KEY_YARD+" to parse the YardInstance used to store the geonames.org index!");
+ } else {
+ log.info(String.format("Using Yard %s (id=%s) to index parsed RDF data",
+ yard.getName(),yard.getId()));
+ }
+ this.vf = yard.getValueFactory();
+ Object rdfFiles = config.get(KEY_RDF_FILES);
+ if(rdfFiles instanceof Iterable<?>){
+ this.rdfFiles = new ArrayList<File>();
+ for(Object value : (Iterable<?>)rdfFiles){
+ this.rdfFiles.add(checkFile(value.toString()));
+ }
+ } else {
+ this.rdfFiles = Collections.singletonList(checkFile(rdfFiles.toString()));
+ }
+ Object indexingDir = config.get(KEY_RDF_STORE_DIR);
+ if(indexingDir == null){
+ indexingDir = "indexingData";
+ config.put(KEY_RDF_STORE_DIR, indexingDir);
+ }
+ this.indexingDir = checkFile(indexingDir.toString(), false, true);
+ Object modelName = config.get(KEY_MODEL_NAME);
+ if(modelName == null){
+ modelName = "indexingModel-"+ModelUtils.randomUUID().toString();
+ config.put(KEY_MODEL_NAME, modelName);
+ }
+ this.modelName = modelName.toString();
+ //init the types!
+ Iterable<?> types = (Iterable<?>)config.get(KEY_RDF_TYPES);
+ if(types != null){
+ Set<String> typeSet = new HashSet<String>();
+ for(Object type : types){
+ if(type != null){
+ typeSet.add(type.toString());
+ log.info(" - adding Resoures with rdf:type "+type);
+ }
+ }
+ if(typeSet.isEmpty()){
+ log.info(" - adding all Types (no rdf:type based restriction for RDF Reseource present)");
+ this.types = null;
+ } else {
+ this.types = typeSet;
+ }
+ } else{
+ log.info(" - adding all Types (no rdf:type based restriction for RDF Reseource present)");
+ this.types = null; //null or an iterable with one or more elements!
+ }
+ //init the indexing mode
+ Object indexingMode = config.get(KEY_INDEXING_MODE);
+ if(indexingMode == null){
+ this.indexingMode = IndexingMode.REPLACE; //default to replace
+ } else if(indexingMode instanceof IndexingMode){
+ this.indexingMode = (IndexingMode)indexingMode;
+ } else {
+ try {
+ this.indexingMode = IndexingMode.valueOf(indexingMode.toString());
+ }catch (IllegalArgumentException e) {
+ //catch and re-throw with a better message!
+ throw new IllegalArgumentException(
+ String.format("Values of KEY \"%s\" MUST BE of Type %s or the toString() value MUST BE a member of this Enumeration. If the Key is missing %s is used!",
+ KEY_INDEXING_MODE,IndexingMode.class,IndexingMode.REPLACE),e);
+ }
+ }
+ //init the fieldMapper
+ Iterable<?> mappings = (Iterable<?>)config.get(KEY_FIELD_MAPPINGS);
+ List<FieldMapping> fieldMappings;
+ if(mappings != null){
+ fieldMappings = new ArrayList<FieldMapping>();
+ for(Object mappingString : mappings){
+ if(mappingString != null){
+ FieldMapping fieldMapping = FieldMappingUtils.parseFieldMapping(mappingString.toString());
+ if(fieldMapping != null){
+ fieldMappings.add(fieldMapping);
+ }
+ }
+ }
+ if(!fieldMappings.isEmpty()){
+ this.mapper = new DefaultFieldMapperImpl(ValueConverterFactory.getInstance(vf));
+ for(FieldMapping mapping : fieldMappings){
+ mapper.addMapping(mapping);
+ }
+ //we need to add a mapping for the field rankings (if a mapper is present)
+ mapper.addMapping(new FieldMapping(this.entityRankingField));
+ } else {
+ this.mapper = null;
+ }
+ } else {
+ this.mapper = null;
+ }
+ File modelDir = new File(this.indexingDir,this.modelName);
+ if(!modelDir.exists()){
+ modelDir.mkdir();
+ } else if(!modelDir.isDirectory()){
+ throw new IllegalStateException(String.format("A directory for %s already exists but is not a directory!",modelDir.getAbsoluteFile()));
+ } //else exists and is a dir -> nothing to do
+ Object skipRead = config.get(KEY_SKIP_READ);
+ if(skipRead != null){
+ if(skipRead instanceof Boolean){
+ this.skipRead = ((Boolean)skipRead).booleanValue();
+ } else {
+ this.skipRead = Boolean.parseBoolean(skipRead.toString());
+ }
+ } else {
+ this.skipRead = false;
+ }
+ Integer chunkSize = (Integer)config.get(KEY_CHUNK_SIZE);
+ if(chunkSize != null && chunkSize>0){
+ this.indexingChunkSize = chunkSize;
+ } //else use default value of 1000
+
+ this.modelLocation = new Location(modelDir.getAbsolutePath());
+ this.indexingDataset = TDBFactory.createDatasetGraph(modelLocation) ;
+ //this.provider = new IndexingModelProvider(this.indexingDir);
+
+ //init entity Ranking
+ try{
+ this.entityRankings = (Map<String,Float>)config.get(KEY_ENTITY_RANKINGS);
+ }catch (RuntimeException e) {
+ log.error("Parsed Entity Rankings MUST use the form Map<String,Float>");
+ System.exit(0);
+ }
+ Object ignore = config.get(KEY_IGNORE_ENTITIES_WITHOUT_ENTITY_RANKING);
+ if(ignore != null){
+ if(ignore instanceof Boolean){
+ this.ignoreEntitiesWithoutRank = (Boolean)ignore;
+ } else {
+ this.ignoreEntitiesWithoutRank = Boolean.parseBoolean(ignore.toString());
+ }
+ }
+ Object defaultRankingObject = config.get(KEY_DEFAULT_ENTITY_RANKING);
+ if(defaultRankingObject != null){
+ float defaultranking = -1;
+ if(defaultRankingObject instanceof Float){
+ defaultranking = (Float)defaultRankingObject;
+ } else {
+ try {
+ defaultranking = Float.parseFloat(defaultRankingObject.toString());
+ } catch (Exception e) {
+ log.error("Unable to parse Float value for the Default Entity Ranking from the value parsed for the KEY_DEFAULT_ENTITY_RANKING key (value: "+defaultRankingObject+")");
+ System.exit(0);
+ }
+ }
+ this.defaultEntityRanking = defaultranking;
+ }
+ Object minimumRequiredRankingObject = config.get(KEY_REQUIRED_ENTITY_RANKING);
+ if(minimumRequiredRankingObject != null){
+ float minRanking = -1;
+ if(minimumRequiredRankingObject instanceof Float){
+ minRanking = (Float)minimumRequiredRankingObject;
+ } else {
+ try {
+ minRanking = Float.parseFloat(minimumRequiredRankingObject.toString());
+ } catch (Exception e) {
+ log.error("Unable to parse Float value for the Minimum Required Entity Ranking from the value parsed for the KEY_DEFAULT_ENTITY_RANKING key (value: "+minimumRequiredRankingObject+")");
+ System.exit(0);
+ }
+ }
+ if(minRanking>=0){ //setting a valid required ranking automatically
+ //means that entities without a rank should be ignored!
+ this.ignoreEntitiesWithoutRank = true;
+ }
+ this.minimumRequiredEntityRanking = minRanking;
+ }
+ Object rankingMode = config.get(KEY_ENTITY_RANKING_BASED_INDEXING_MODE);
+ if(rankingMode != null){
+ if(rankingMode instanceof Boolean){
+ this.rankingMode = (Boolean)rankingMode;
+ } else {
+ this.rankingMode = Boolean.parseBoolean(rankingMode.toString());
+ }
+ }
+ if(this.rankingMode && this.entityRankings == null){
+ log.warn("The Entity Ranking based Indexing Mode can not be activated if no EntityRankings are parsed! -> deactivate Ranking Mode (intertes over all Resources in the RDF Data)");
+ this.rankingMode = false;
+ }
+ Object resumeMode = config.get(KEY_RESUME_MODE);
+ if(resumeMode != null) {
+ if(resumeMode instanceof Boolean){
+ this.resumeMode = (Boolean)resumeMode;
+ } else {
+ this.resumeMode = Boolean.parseBoolean(resumeMode.toString());
+ }
+ } else {
+ this.resumeMode = false;
+ }
+ }
+ public void index() throws YardException{
+ log.info("initialize ...");
+ if(!skipRead){
+ loadRdfFiles();
+ } else {
+ log.info(" ... skiping loading of RDF data");
+ }
+ if(rankingMode){
+ indexRanked();
+ } else {
+ indexResources();
+ }
+ writeCacheBaseConfiguration();
+ }
+ /**
+ * This Method is used to process the RDF Data if all Resource can be indexed,
+ * because it provides the best performance. Mainly because it reads everything
+ * from a single stream and therefore gives the OS the best opportunities to
+ * optimise file access.
+ * @throws YardException
+ */
+ private void indexResources() throws YardException{
StringBuilder qb = new StringBuilder();
/*
* NOTES:
@@ -498,7 +498,7 @@ public class RdfIndexer {
*/
qb.append("SELECT ?resource ?field ?value");
qb.append("{ ");
- qb.append(" ?resource ?field ?value . ");
+ qb.append(" ?resource ?field ?value . ");
// qb.append(" OPTIONAL { ?incoming ?relationship ?resource . } . ");
//qb.append(" FILTER ( isURI(?resource) ) . ");
qb.append("} ");
@@ -516,475 +516,475 @@ public class RdfIndexer {
long start = System.currentTimeMillis();
long startCurrent = start;
String current = null;
- Representation source = null;
- while(resultSet.hasNext()){
- stdCount++;
- repStdCount++;
- QuerySolution solution =resultSet.next();
- RDFNode subject = solution.get("resource");
- if(subject.isURIResource()){
- String resource = subject.asResource().toString();
- if(!resource.equals(current)){ //start of next resource -> index current
- count++;
- if(count%10000==0){
- long thisOne = System.currentTimeMillis()-startCurrent;
- long all = System.currentTimeMillis()-start;
- log.info(String.format("processed %d resources (%dall %dlast indexed) in %dms (%sms/last | avg: %sms/indexed) std/resource (%s indexed| %s non indexed)",
- count,indexed,indexed-lastIndexed,thisOne,(float)thisOne/(indexed-lastIndexed),(float)all/indexed,(float)indexedStdCount/indexed,((float)stdCount-indexedStdCount)/(count-indexed)));
- startCurrent = System.currentTimeMillis();
- lastIndexed = indexed;
- }
- if(source != null){
- if(processRanking(source)){
- if(resumeMode && yard.isRepresentation(source.getId())){ //resume mode check
- //log.info("S<source Resource:\n"+ModelUtils.getRepresentationInfo(source));
- indexed++;
- indexedStdCount=indexedStdCount+repStdCount;
- storeRepresentation(source);
- //here we need todo the indexing!
- } //else already indexed -> nothing to do
- } // else rankging to low -> do not index
- } //else the first item to index -> ignore
- //init next resource
- source = vf.createRepresentation(resource);
- current = resource;
- repStdCount = 0;
- }
- RDFNode fieldNode = solution.get("field");
- if(fieldNode.isURIResource()){
- String field = fieldNode.asResource().getURI();
- RDFNode value = solution.get("value");
- if(value.isURIResource()){
- source.addReference(field, value.asResource().getURI());
- } else if(value.isLiteral()){
- Literal literal = value.asLiteral();
- if(literal.getDatatype() != null){
- Object literalValue;
- try {
- literalValue = literal.getValue();
- } catch (DatatypeFormatException e) {
- log.warn(" Unable to convert "+literal.getLexicalForm()+" to "+literal.getDatatype()+"-> use lecicalForm");
- literalValue = literal.getLexicalForm();
- }
- if(literalValue instanceof BaseDatatype.TypedValue){
- source.add(field, ((BaseDatatype.TypedValue)literalValue).lexicalValue);
- } else if(literalValue instanceof XSDDateTime) {
- source.add(field, ((XSDDateTime)literalValue).asCalendar().getTime()); //Rick uses the time
- } else if(literalValue instanceof XSDDuration) {
- source.add(field, literalValue.toString());
- } else {
- source.add(field, literalValue);
- }
- } else {
- String lang = literal.getLanguage();
- if(lang != null && lang.isEmpty()){
- lang = null;
- }
- source.addNaturalText(field, literal.getLexicalForm(),lang);
- }
- }
- }
- } else {
- log.warn(String.format("Current Subject %s is not a URI Resource -> ignored",subject));
- }
- } //end while
- long end = System.currentTimeMillis();
- log.info(String.format("%d in %dms (%sms/item | %sstd/resource)",count,end-start,""+((float)end - start)/count,""+(float)stdCount/count));
- }
-
-
- private boolean processRanking(Representation source) {
- Float ranking = entityRankings == null ? null :entityRankings.get(source.getId());
- //ignore values lower than 0
- if(ranking != null){
- if(ranking < 0){
- ranking = null;
- }
- }
- if(ranking != null && ranking > 1){
- log.warn("Parse Ranking Map contains Entity Ranking > 1 (ranking="+ranking+") for Entity "+source.getId()+" -> use 1.0 as Ranking!");
- ranking = 1f;
- }
- if(ranking == null){
- for(Iterator<Object> values =source.get(entityRankingField);values.hasNext() && ranking == null;){
- Object value = values.next();
- if(value instanceof Float){
- ranking = (Float) value;
- } else {
- try {
- ranking = Float.parseFloat(value.toString());
- } catch (NumberFormatException e) {
- log.warn(String.format("Unable to parse the Entity Ranking from field %s=%s[type=%s] -> The Document Boost MUST BE a Float value!",entityRankingField,value,value.getClass()));
- }
- }
- }
- } else {
- source.set(entityRankingField, ranking);
- }
- if(ranking != null && ranking > 1){
- log.warn("Parse RDF data include a entity ranking > 1 (ranking="+ranking+") for Entity "+source.getId()+" and Field "+entityRankingField+"-> use 1.0 as Ranking!");
- ranking = 1f;
- }
- if(ranking == null && this.defaultEntityRanking >= 0){
- //set to default
- ranking = defaultEntityRanking;
- source.set(entityRankingField, ranking);
- }
- if(ranking == null){
- return !ignoreEntitiesWithoutRank; //return false to ignore
- } else {
- return ranking > minimumRequiredEntityRanking;
- }
-
- }
- private File checkFile(String value) {
- return checkFile(value,true,false);
- }
- private File checkFile(String value,boolean file, boolean create) {
- if(value.startsWith(File.pathSeparator)){
- //remove leading path separators!
- value = value.substring(File.pathSeparator.length());
- }
- File testFile = new File(value.toString());
-
- if(!testFile.exists()){
- if(create){ //create
- if(file){
- try {
- testFile.createNewFile();
- } catch (IOException e) {
- throw new IllegalStateException("Unable to create File "+testFile,e);
- }
- } else {
- if(!testFile.mkdir()){
- throw new IllegalStateException("Unable to create Directory "+testFile);
- }
- }
- } else { //not found
- throw new IllegalStateException("File "+testFile.getAbsolutePath()+" does not exist!");
- }
- }
- if(file && !testFile.isFile()){
- throw new IllegalStateException("parsed file "+value+"is not a file!");
- }
- if(!file && !testFile.isDirectory()){
- throw new IllegalStateException("parsed file "+value+"is not a directory!");
- }
- if(!testFile.canRead()){
- throw new IllegalStateException("Unable to read File "+value+"!");
- }
- return testFile;
- }
-
- private void loadRdfFiles(){
- //TcProvider provider = new IndexingModelProvider(indexingDir);
- long start=System.currentTimeMillis();
- log.info(String.format("Loding RDF %d File%s ...",rdfFiles.size(),rdfFiles.size()>1?"s":""));
+ Representation source = null;
+ while(resultSet.hasNext()){
+ stdCount++;
+ repStdCount++;
+ QuerySolution solution =resultSet.next();
+ RDFNode subject = solution.get("resource");
+ if(subject.isURIResource()){
+ String resource = subject.asResource().toString();
+ if(!resource.equals(current)){ //start of next resource -> index current
+ count++;
+ if(count%10000==0){
+ long thisOne = System.currentTimeMillis()-startCurrent;
+ long all = System.currentTimeMillis()-start;
+ log.info(String.format("processed %d resources (%dall %dlast indexed) in %dms (%sms/last | avg: %sms/indexed) std/resource (%s indexed| %s non indexed)",
+ count,indexed,indexed-lastIndexed,thisOne,(float)thisOne/(indexed-lastIndexed),(float)all/indexed,(float)indexedStdCount/indexed,((float)stdCount-indexedStdCount)/(count-indexed)));
+ startCurrent = System.currentTimeMillis();
+ lastIndexed = indexed;
+ }
+ if(source != null){
+ if(processRanking(source)){
+ if(resumeMode && yard.isRepresentation(source.getId())){ //resume mode check
+ //log.info("S<source Resource:\n"+ModelUtils.getRepresentationInfo(source));
+ indexed++;
+ indexedStdCount=indexedStdCount+repStdCount;
+ storeRepresentation(source);
+                                //here we need to do the indexing!
+ } //else already indexed -> nothing to do
+                        } // else ranking too low -> do not index
+ } //else the first item to index -> ignore
+ //init next resource
+ source = vf.createRepresentation(resource);
+ current = resource;
+ repStdCount = 0;
+ }
+ RDFNode fieldNode = solution.get("field");
+ if(fieldNode.isURIResource()){
+ String field = fieldNode.asResource().getURI();
+ RDFNode value = solution.get("value");
+ if(value.isURIResource()){
+ source.addReference(field, value.asResource().getURI());
+ } else if(value.isLiteral()){
+ Literal literal = value.asLiteral();
+ if(literal.getDatatype() != null){
+ Object literalValue;
+ try {
+ literalValue = literal.getValue();
+ } catch (DatatypeFormatException e) {
+ log.warn(" Unable to convert "+literal.getLexicalForm()+" to "+literal.getDatatype()+"-> use lecicalForm");
+ literalValue = literal.getLexicalForm();
+ }
+ if(literalValue instanceof BaseDatatype.TypedValue){
+ source.add(field, ((BaseDatatype.TypedValue)literalValue).lexicalValue);
+ } else if(literalValue instanceof XSDDateTime) {
+ source.add(field, ((XSDDateTime)literalValue).asCalendar().getTime()); //Rick uses the time
+ } else if(literalValue instanceof XSDDuration) {
+ source.add(field, literalValue.toString());
+ } else {
+ source.add(field, literalValue);
+ }
+ } else {
+ String lang = literal.getLanguage();
+ if(lang != null && lang.isEmpty()){
+ lang = null;
+ }
+ source.addNaturalText(field, literal.getLexicalForm(),lang);
+ }
+ }
+ }
+ } else {
+ log.warn(String.format("Current Subject %s is not a URI Resource -> ignored",subject));
+ }
+ } //end while
+ long end = System.currentTimeMillis();
+ log.info(String.format("%d in %dms (%sms/item | %sstd/resource)",count,end-start,""+((float)end - start)/count,""+(float)stdCount/count));
+ }
+
+
+ private boolean processRanking(Representation source) {
+ Float ranking = entityRankings == null ? null :entityRankings.get(source.getId());
+ //ignore values lower than 0
+ if(ranking != null){
+ if(ranking < 0){
+ ranking = null;
+ }
+ }
+ if(ranking != null && ranking > 1){
+ log.warn("Parse Ranking Map contains Entity Ranking > 1 (ranking="+ranking+") for Entity "+source.getId()+" -> use 1.0 as Ranking!");
+ ranking = 1f;
+ }
+ if(ranking == null){
+ for(Iterator<Object> values =source.get(entityRankingField);values.hasNext() && ranking == null;){
+ Object value = values.next();
+ if(value instanceof Float){
+ ranking = (Float) value;
+ } else {
+ try {
+ ranking = Float.parseFloat(value.toString());
+ } catch (NumberFormatException e) {
+ log.warn(String.format("Unable to parse the Entity Ranking from field %s=%s[type=%s] -> The Document Boost MUST BE a Float value!",entityRankingField,value,value.getClass()));
+ }
+ }
+ }
+ } else {
+ source.set(entityRankingField, ranking);
+ }
+ if(ranking != null && ranking > 1){
+ log.warn("Parse RDF data include a entity ranking > 1 (ranking="+ranking+") for Entity "+source.getId()+" and Field "+entityRankingField+"-> use 1.0 as Ranking!");
+ ranking = 1f;
+ }
+ if(ranking == null && this.defaultEntityRanking >= 0){
+ //set to default
+ ranking = defaultEntityRanking;
+ source.set(entityRankingField, ranking);
+ }
+ if(ranking == null){
+ return !ignoreEntitiesWithoutRank; //return false to ignore
+ } else {
+ return ranking > minimumRequiredEntityRanking;
+ }
+
+ }
+ private File checkFile(String value) {
+ return checkFile(value,true,false);
+ }
+ private File checkFile(String value,boolean file, boolean create) {
+ if(value.startsWith(File.pathSeparator)){
+ //remove leading path separators!
+ value = value.substring(File.pathSeparator.length());
+ }
+ File testFile = new File(value.toString());
+
+ if(!testFile.exists()){
+ if(create){ //create
+ if(file){
+ try {
+ testFile.createNewFile();
+ } catch (IOException e) {
+ throw new IllegalStateException("Unable to create File "+testFile,e);
+ }
+ } else {
+ if(!testFile.mkdir()){
+ throw new IllegalStateException("Unable to create Directory "+testFile);
+ }
+ }
+ } else { //not found
+ throw new IllegalStateException("File "+testFile.getAbsolutePath()+" does not exist!");
+ }
+ }
+ if(file && !testFile.isFile()){
+ throw new IllegalStateException("parsed file "+value+"is not a file!");
+ }
+ if(!file && !testFile.isDirectory()){
+ throw new IllegalStateException("parsed file "+value+"is not a directory!");
+ }
+ if(!testFile.canRead()){
+ throw new IllegalStateException("Unable to read File "+value+"!");
+ }
+ return testFile;
+ }
+
+ private void loadRdfFiles(){
+ //TcProvider provider = new IndexingModelProvider(indexingDir);
+ long start=System.currentTimeMillis();
+ log.info(String.format("Loding RDF %d File%s ...",rdfFiles.size(),rdfFiles.size()>1?"s":""));
for (File modelFile : rdfFiles) {
- long startFile = System.currentTimeMillis();
- log.info(String.format(" > loading '%s' into model '%s'...", modelFile, modelName));
+ long startFile = System.currentTimeMillis();
+ log.info(String.format(" > loading '%s' into model '%s'...", modelFile, modelName));
String name = modelFile.getName();
if(name.endsWith(".zip")){
- log.info(" - processing Zip-Archive Entries:");
- try {
- ZipFile zipArchive = new ZipFile(modelFile);
- Enumeration<ZipArchiveEntry> entries = zipArchive.getEntries();
- while(entries.hasMoreElements()){
- ZipArchiveEntry entry = entries.nextElement();
- if(!entry.isDirectory()){
- String entryName = entry.getName();
- log.info(String.format(" o entry '%s' into model '%s'...", entryName, modelName));
+ log.info(" - processing Zip-Archive Entries:");
+ try {
+ ZipFile zipArchive = new ZipFile(modelFile);
+ Enumeration<ZipArchiveEntry> entries = zipArchive.getEntries();
+ while(entries.hasMoreElements()){
+ ZipArchiveEntry entry = entries.nextElement();
+ if(!entry.isDirectory()){
+ String entryName = entry.getName();
+ log.info(String.format(" o entry '%s' into model '%s'...", entryName, modelName));
importRdfData(zipArchive.getInputStream(entry), entryName);
- }
- }
- } catch (IOException e) {
- throw new IllegalStateException(e);
- }
+ }
+ }
+ } catch (IOException e) {
+ throw new IllegalStateException(e);
+ }
} else {
InputStream is;
- try {
- is = new FileInputStream(modelFile);
- } catch (FileNotFoundException e) {
- //during init it is checked that files exists and are files and there is read access
- //so this can only happen if someone deletes the file inbetween
- throw new IllegalStateException(e);
- }
+ try {
+ is = new FileInputStream(modelFile);
+ } catch (FileNotFoundException e) {
+            //during init it is checked that files exist and are files and there is read access
+            //so this can only happen if someone deletes the file in between
+ throw new IllegalStateException(e);
+ }
importRdfData(is, name);
}
- //add the parsed Triples to the indexing graph!
+ //add the parsed Triples to the indexing graph!
//QUESTION: Does that load the whole file into memory?
// indexingGraph.addAll(parser.parse(is, format, null));
log.info(String.format(" - completed in %d seconds", (System.currentTimeMillis()-startFile)/1000));
}
log.info(String.format(" ... %d files imported in %d seconds", rdfFiles.size(),(System.currentTimeMillis()-start)/1000));
- }
- /**
- * This method imports the data from an input stream. The name is used to
- * guess the RDF format used. The stream may be come directly form a file,
- * an archive, an URL or an entry in an ZIP file
- * @param is
- * @param name
- */
- private void importRdfData(InputStream is, String name) {
+ }
+ /**
+ * This method imports the data from an input stream. The name is used to
+ * guess the RDF format used. The stream may come directly from a file,
+ * an archive, an URL or an entry in an ZIP file
+ * @param is
+ * @param name
+ */
+ private void importRdfData(InputStream is, String name) {
if (name.endsWith(".gz")) {
try {
- is = new GZIPInputStream(is);
- } catch (IOException e) {
- //during init it is checked that files exists and are files and there is read access
- //so this can only happen if someone deletes the file inbetween
- throw new IllegalStateException(e);
- }
+ is = new GZIPInputStream(is);
+ } catch (IOException e) {
+ //during init it is checked that files exists and are files and there is read access
+ //so this can only happen if someone deletes the file inbetween
+ throw new IllegalStateException(e);
+ }
name = name.replaceFirst("\\.gz$", "");
log.info(" - from GZIP Archive");
} else if (name.endsWith(".bz2")) {
try {
- is = new BZip2CompressorInputStream(is);
- } catch (IOException e) {
- //during init it is checked that files exists and are files and there is read access
- //so this can only happen if someone deletes the file inbetween
- throw new IllegalStateException(e);
- }
+ is = new BZip2CompressorInputStream(is);
+ } catch (IOException e) {
+            //during init it is checked that files exist and are files and there is read access
+            //so this can only happen if someone deletes the file in between
+ throw new IllegalStateException(e);
+ }
name = name.replaceFirst("\\.bz2$", "");
log.info(" - from BZip2 Archive");
}//TODO: No Zip Files inside Zip Files supported :o( ^^
Lang format = Lang.guess(name);
-// if (name.endsWith(".nt")) {
-// format = Lang.NTRIPLES;
-// } else if (name.endsWith(".n3")) {
-// format = Lang.N3;
-// } else {// XML is the default format
-// format = Lang.RDFXML;
-// }
- //For N-Triple we can use the TDBLoader
- if(format == Lang.NTRIPLES){
- TDBLoader.load(indexingDataset, is,true);
- } else if(format != Lang.RDFXML){
- //use RIOT to parse the format but with a special configuration
- //RiotReader!
- TDBLoader loader = new TDBLoader() ;
- loader.setShowProgress(true);
- Destination<Triple> dest = createDestination();
- dest.start() ;
- RiotReader.parseTriples(is, format, null, dest) ;
- dest.finish() ;
- } else { //RDFXML
- //in that case we need to use ARP
- Model model = ModelFactory.createModelForGraph(indexingDataset.getDefaultGraph());
- model.read(is, null);
- }
- }
- /**
- * Creates a triple destination for the default dataset of the
- * {@link #indexingDataset}.
- * This code is based on how Destinations are created in the {@link BulkLoader},
- * implementation. Note that
- * {@link BulkLoader#loadDefaultGraph(DatasetGraphTDB, InputStream, boolean)}
- * can not be used for formats other than {@link Lang#NTRIPLES} because it
- * hard codes this format for loading data form the parsed InputStream.
- * @return the destination!
- */
- private Destination<Triple> createDestination() {
- LoadMonitor monitor = new LoadMonitor(indexingDataset, log, "triples",50000,100000);
+// if (name.endsWith(".nt")) {
+// format = Lang.NTRIPLES;
+// } else if (name.endsWith(".n3")) {
+// format = Lang.N3;
+// } else {// XML is the default format
+// format = Lang.RDFXML;
+// }
+ //For N-Triple we can use the TDBLoader
+ if(format == Lang.NTRIPLES){
+ TDBLoader.load(indexingDataset, is,true);
+ } else if(format != Lang.RDFXML){
+ //use RIOT to parse the format but with a special configuration
+ //RiotReader!
+ TDBLoader loader = new TDBLoader() ;
+ loader.setShowProgress(true);
+ Destination<Triple> dest = createDestination();
+ dest.start() ;
+ RiotReader.parseTriples(is, format, null, dest) ;
+ dest.finish() ;
+ } else { //RDFXML
+ //in that case we need to use ARP
+ Model model = ModelFactory.createModelForGraph(indexingDataset.getDefaultGraph());
+ model.read(is, null);
+ }
+ }
+ /**
+ * Creates a triple destination for the default dataset of the
+ * {@link #indexingDataset}.
+ * This code is based on how Destinations are created in the {@link BulkLoader},
+ * implementation. Note that
+ * {@link BulkLoader#loadDefaultGraph(DatasetGraphTDB, InputStream, boolean)}
+ * can not be used for formats other than {@link Lang#NTRIPLES} because it
+ * hard codes this format for loading data from the parsed InputStream.
+ * @return the destination!
+ */
+ private Destination<Triple> createDestination() {
+ LoadMonitor monitor = new LoadMonitor(indexingDataset, log, "triples",50000,100000);
final LoaderNodeTupleTable loaderTriples = new LoaderNodeTupleTable(indexingDataset.getTripleTable().getNodeTupleTable(), "triples", monitor) ;
-
- Destination<Triple> sink = new Destination<Triple>() {
- long count = 0 ;
- final public void start()
- {
- loaderTriples.loadStart() ;
- loaderTriples.loadDataStart() ;
- }
- final public void send(Triple triple)
- {
- loaderTriples.load(triple.getSubject(), triple.getPredicate(), triple.getObject()) ;
- count++ ;
- }
-
- final public void flush() { }
- public void close() { }
-
- final public void finish()
- {
- loaderTriples.loadDataFinish() ;
- loaderTriples.loadIndexStart() ;
- loaderTriples.loadIndexFinish() ;
- loaderTriples.loadFinish() ;
- }
- } ;
- return sink ;
- }
-
-
-
- /**
- * The List used to cache up to {@link #indexingChunkSize} Representations
- * before they are stored in the Yard.
- */
- private List<Representation> chunkCache = new ArrayList<Representation>(this.indexingChunkSize);
- private void storeRepresentation(Representation source) throws YardException{
- if(source != null){
- chunkCache.add(
- mapper ==null?source: //if no mappings -> store the source
- //else process the field mappings
- mapper.applyMappings(source,vf.createRepresentation(source.getId())));
- }
- if(chunkCache.size()>=indexingChunkSize){
- yard.store(chunkCache);
- chunkCache.clear();
- }
- }
-
- /**
- * As the last step we need to create the baseMappings configuration
- * needed to used the Index as RICK full cache!
- * @throws YardException would be really bad if after successfully indexing
- * about 8 millions of documents we get an error from the yard at the
- * last possible opportunity :(
- */
- private void writeCacheBaseConfiguration() throws YardException {
- log.info("Write BaseMappings for geonames.org Cache");
- if(mapper != null){
- CacheUtils.storeBaseMappingsConfiguration(yard, mapper);
- }
- log.info(" < completed");
- }
-//------------------------------------------------------------------------------
+
+ Destination<Triple> sink = new Destination<Triple>() {
+ long count = 0 ;
+ final public void start()
+ {
+ loaderTriples.loadStart() ;
+ loaderTriples.loadDataStart() ;
+ }
+ final public void send(Triple triple)
+ {
+ loaderTriples.load(triple.getSubject(), triple.getPredicate(), triple.getObject()) ;
+ count++ ;
+ }
+
+ final public void flush() { }
+ public void close() { }
+
+ final public void finish()
+ {
+ loaderTriples.loadDataFinish() ;
+ loaderTriples.loadIndexStart() ;
+ loaderTriples.loadIndexFinish() ;
+ loaderTriples.loadFinish() ;
+ }
+ } ;
+ return sink ;
+ }
+
+
+
+ /**
+ * The List used to cache up to {@link #indexingChunkSize} Representations
+ * before they are stored in the Yard.
+ */
+ private List<Representation> chunkCache = new ArrayList<Representation>(this.indexingChunkSize);
+ private void storeRepresentation(Representation source) throws YardException{
+ if(source != null){
+ chunkCache.add(
+ mapper ==null?source: //if no mappings -> store the source
+ //else process the field mappings
+ mapper.applyMappings(source,vf.createRepresentation(source.getId())));
+ }
+ if(chunkCache.size()>=indexingChunkSize){
+ yard.store(chunkCache);
+ chunkCache.clear();
+ }
+ }
+
+ /**
+ * As the last step we need to create the baseMappings configuration
+ * needed to use the Index as RICK full cache!
+ * @throws YardException would be really bad if after successfully indexing
+ * about 8 millions of documents we get an error from the yard at the
+ * last possible opportunity :(
+ */
+ private void writeCacheBaseConfiguration() throws YardException {
+ log.info("Write BaseMappings for geonames.org Cache");
+ if(mapper != null){
+ CacheUtils.storeBaseMappingsConfiguration(yard, mapper);
+ }
+ log.info(" < completed");
+ }
+//------------------------------------------------------------------------------
// Other implemented variants with less performance than indexResource3!
//------------------------------------------------------------------------------
-// private void indexResource2(Resource resource){
+// private void indexResource2(Resource resource){
// Query q = QueryFactory.create(String.format(resourceQuery,resource.getURI(),resource.getURI()), Syntax.syntaxARQ);
// final ResultSet resultSet = QueryExecutionFactory.create(q, indexingDataset.toDataset()).execSelect();
-// Representation source = vf.createRepresentation(resource.getURI());
-// while(resultSet.hasNext()){
-// QuerySolution solution =resultSet.next();
-// RDFNode fieldNode = solution.get("field");
-// if(fieldNode.isURIResource()){
-// String field = fieldNode.asResource().getURI();
-// RDFNode value = solution.get("value");
-// if(value.isURIResource()){
-// source.addReference(field, value.asResource().getURI());
-// } else if(value.isLiteral()){
-// Literal literal = value.asLiteral();
-// if(literal.getDatatype() != null){
-// Object literalValue;
-// try {
-// literalValue = literal.getValue();
-// } catch (DatatypeFormatException e) {
-// log.warn(" Unable to convert "+literal.getLexicalForm()+" to "+literal.getDatatype()+"-> use lecicalForm");
-// literalValue = literal.getLexicalForm();
-// }
-// if(literalValue instanceof BaseDatatype.TypedValue){
-// source.add(field, literal.getLexicalForm());
-// } else {
-// source.add(field, literal.getValue());
-// }
-// } else {
-// String lang = literal.getLanguage();
-// if(lang != null && lang.isEmpty()){
-// lang = null;
-// }
-// source.addNaturalText(field, literal.getLexicalForm(),lang);
-// }
-// }
-// }
-// }
-// //log.info("S<source Resource:\n"+ModelUtils.getRepresentationInfo(source));
-// }
- private void indexRanked() throws YardException {
- if(entityRankings == null){
- throw new IllegalStateException("Unable to index with Etity Ranking Mode if no Entity Rankings are present!");
- }
- long count = 0;
- long alreadyIndexed = 0;
- long stdCount = 0;
- long notFound = 0;
- long start = System.currentTimeMillis();
- long startCurrent = System.currentTimeMillis();
- for(Entry<String,Float> entry : entityRankings.entrySet()){
- if(entry.getValue() < minimumRequiredEntityRanking){
- continue; //ignore entities with rank < the min required one
- }
- count++;
- if(count%1000 == 0){
- long thisOne = System.currentTimeMillis()-startCurrent;
- long all = System.currentTimeMillis()-start;
- log.info(String.format("processed %s resources %s indexed in %sms (%sms/item | avg: %sms/item) %s std/resourc | %s not found",
- count, count-alreadyIndexed, thisOne,(float)thisOne/1000,(float)all/count,(float)stdCount/(count-alreadyIndexed),notFound));
- startCurrent = System.currentTimeMillis();
- }
- if(resumeMode && yard.isRepresentation(entry.getKey())){
- alreadyIndexed++;
- continue;
- }
- Representation source = vf.createRepresentation(entry.getKey());
- Node resource = Node.createURI(entry.getKey());
- ExtendedIterator<Triple> outgoing = indexingDataset.getDefaultGraph().find(resource, null, null);
- boolean found = outgoing.hasNext();
- while(outgoing.hasNext()){ //iterate over the statements for that resource
- stdCount++;
- Triple statement = outgoing.next();
- Node predicate = statement.getPredicate();
- if(predicate == null || !predicate.isURI()){
- log.warn(String.format("Ignore field %s for resource %s because it is null or not an URI!",
- predicate,resource));
- } else {
- String field = statement.getPredicate().getURI();
- Node object = statement.getObject();
- if(object == null){
- log.warn(String.format("Encountered NULL value for field %s and resource %s",
- predicate,resource));
- }else if(object.isURI()){ //add a reference
- source.addReference(field, object.getURI());
- } else if(object.isLiteral()){ //add a value or a text depending on the dataType
- LiteralLabel ll = object.getLiteral();
- //if the dataType == null , than we can expect a plain literal
- RDFDatatype dataType = ll.getDatatype();
- if(dataType != null){ //add a value
- Object literalValue;
- try {
- literalValue = ll.getValue();
- if(literalValue instanceof BaseDatatype.TypedValue){
- //used for unknown data types
- // -> in such cases yust use the lecial type
- source.add(field, ((BaseDatatype.TypedValue)literalValue).lexicalValue);
- } else if(literalValue instanceof XSDDateTime) {
- source.add(field, ((XSDDateTime)literalValue).asCalendar().getTime()); //Rick uses the time
- } else if(literalValue instanceof XSDDuration) {
- source.add(field, literalValue.toString());
- } else {
- source.add(field, literalValue);
- }
- } catch (DatatypeFormatException e) {
- log.warn(" Unable to convert "+ll.getLexicalForm()+" to "+ll.getDatatype()+"-> use lecicalForm");
- literalValue = ll.getLexicalForm();
- }
- } else { //add a text
- String language = ll.language();
- if(language!=null && language.length()<1){
- language = null;
- }
- source.addNaturalText(field, ll.getLexicalForm(), language);
- }
- // "" is parsed if there is no language
- } else {
- if(object.isBlank()){
- log.info(String.format("ignoreing blank node value %s for field %s and Resource %s!",
- object,field,resource));
- } else {
- log.warn(String.format("ignoreing value %s for field %s and Resource %s because it is of an unsupported type!",
- object,field,resource));
- }
- } //end different value node type
- } //end else predicate != null
- } //end iteration over resource triple
- if(found) {
- storeRepresentation(source);
- //log.info("Resource: \n"+ModelUtils.getRepresentationInfo(source));
- } else {
- //log.info("No Statements found for "+entry.getKey()+" (ranking="+entry.getValue()+")!");
- notFound++;
- }
- }
- }
+// Representation source = vf.createRepresentation(resource.getURI());
+// while(resultSet.hasNext()){
+// QuerySolution solution =resultSet.next();
+// RDFNode fieldNode = solution.get("field");
+// if(fieldNode.isURIResource()){
+// String field = fieldNode.asResource().getURI();
+// RDFNode value = solution.get("value");
+// if(value.isURIResource()){
+// source.addReference(field, value.asResource().getURI());
+// } else if(value.isLiteral()){
+// Literal literal = value.asLiteral();
+// if(literal.getDatatype() != null){
+// Object literalValue;
+// try {
+// literalValue = literal.getValue();
+// } catch (DatatypeFormatException e) {
+// log.warn(" Unable to convert "+literal.getLexicalForm()+" to "+literal.getDatatype()+"-> use lecicalForm");
+// literalValue = literal.getLexicalForm();
+// }
+// if(literalValue instanceof BaseDatatype.TypedValue){
+// source.add(field, literal.getLexicalForm());
+// } else {
+// source.add(field, literal.getValue());
+// }
+// } else {
+// String lang = literal.getLanguage();
+// if(lang != null && lang.isEmpty()){
+// lang = null;
+// }
+// source.addNaturalText(field, literal.getLexicalForm(),lang);
+// }
+// }
+// }
+// }
+// //log.info("S<source Resource:\n"+ModelUtils.getRepresentationInfo(source));
+// }
+ private void indexRanked() throws YardException {
+ if(entityRankings == null){
+ throw new IllegalStateException("Unable to index with Etity Ranking Mode if no Entity Rankings are present!");
+ }
+ long count = 0;
+ long alreadyIndexed = 0;
+ long stdCount = 0;
+ long notFound = 0;
+ long start = System.currentTimeMillis();
+ long startCurrent = System.currentTimeMillis();
+ for(Entry<String,Float> entry : entityRankings.entrySet()){
+ if(entry.getValue() < minimumRequiredEntityRanking){
+ continue; //ignore entities with rank < the min required one
+ }
+ count++;
+ if(count%1000 == 0){
+ long thisOne = System.currentTimeMillis()-startCurrent;
+ long all = System.currentTimeMillis()-start;
+ log.info(String.format("processed %s resources %s indexed in %sms (%sms/item | avg: %sms/item) %s std/resourc | %s not found",
+ count, count-alreadyIndexed, thisOne,(float)thisOne/1000,(float)all/count,(float)stdCount/(count-alreadyIndexed),notFound));
+ startCurrent = System.currentTimeMillis();
+ }
+ if(resumeMode && yard.isRepresentation(entry.getKey())){
+ alreadyIndexed++;
+ continue;
+ }
+ Representation source = vf.createRepresentation(entry.getKey());
+ Node resource = Node.createURI(entry.getKey());
+ ExtendedIterator<Triple> outgoing = indexingDataset.getDefaultGraph().find(resource, null, null);
+ boolean found = outgoing.hasNext();
+ while(outgoing.hasNext()){ //iterate over the statements for that resource
+ stdCount++;
+ Triple statement = outgoing.next();
+ Node predicate = statement.getPredicate();
+ if(predicate == null || !predicate.isURI()){
+ log.warn(String.format("Ignore field %s for resource %s because it is null or not an URI!",
+ predicate,resource));
+ } else {
+ String field = statement.getPredicate().getURI();
+ Node object = statement.getObject();
+ if(object == null){
+ log.warn(String.format("Encountered NULL value for field %s and resource %s",
+ predicate,resource));
+ }else if(object.isURI()){ //add a reference
+ source.addReference(field, object.getURI());
+ } else if(object.isLiteral()){ //add a value or a text depending on the dataType
+ LiteralLabel ll = object.getLiteral();
+ //if the dataType == null , than we can expect a plain literal
+ RDFDatatype dataType = ll.getDatatype();
+ if(dataType != null){ //add a value
+ Object literalValue;
+ try {
+ literalValue = ll.getValue();
+ if(literalValue instanceof BaseDatatype.TypedValue){
+ //used for unknown data types
+ // -> in such cases yust use the lecial type
+ source.add(field, ((BaseDatatype.TypedValue)literalValue).lexicalValue);
+ } else if(literalValue instanceof XSDDateTime) {
+ source.add(field, ((XSDDateTime)literalValue).asCalendar().getTime()); //Rick uses the time
+ } else if(literalValue instanceof XSDDuration) {
+ source.add(field, literalValue.toString());
+ } else {
+ source.add(field, literalValue);
+ }
+ } catch (DatatypeFormatException e) {
+ log.warn(" Unable to convert "+ll.getLexicalForm()+" to "+ll.getDatatype()+"-> use lecicalForm");
+ literalValue = ll.getLexicalForm();
+ }
+ } else { //add a text
+ String language = ll.language();
+ if(language!=null && language.length()<1){
+ language = null;
+ }
+ source.addNaturalText(field, ll.getLexicalForm(), language);
+ }
+ // "" is parsed if there is no language
+ } else {
+ if(object.isBlank()){
+ log.info(String.format("ignoreing blank node value %s for field %s and Resource %s!",
+ object,field,resource));
+ } else {
+ log.warn(String.format("ignoreing value %s for field %s and Resource %s because it is of an unsupported type!",
+ object,field,resource));
+ }
+ } //end different value node type
+ } //end else predicate != null
+ } //end iteration over resource triple
+ if(found) {
+ storeRepresentation(source);
+ //log.info("Resource: \n"+ModelUtils.getRepresentationInfo(source));
+ } else {
+ //log.info("No Statements found for "+entry.getKey()+" (ranking="+entry.getValue()+")!");
+ notFound++;
+ }
+ }
+ }
}