You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@marmotta.apache.org by wi...@apache.org on 2014/01/17 15:17:57 UTC

git commit: MARMOTTA-423: preparing the infrastructure to plug in rules that fix Freebase's invalid Turtle

Updated Branches:
  refs/heads/develop dd23600a0 -> 9e737af91


MARMOTTA-423: preparing the infrastructure to plug in rules that fix Freebase's invalid Turtle


Project: http://git-wip-us.apache.org/repos/asf/marmotta/repo
Commit: http://git-wip-us.apache.org/repos/asf/marmotta/commit/9e737af9
Tree: http://git-wip-us.apache.org/repos/asf/marmotta/tree/9e737af9
Diff: http://git-wip-us.apache.org/repos/asf/marmotta/diff/9e737af9

Branch: refs/heads/develop
Commit: 9e737af916fcaae92d406f127607b3628a768407
Parents: dd23600
Author: Sergio Fernández <wi...@apache.org>
Authored: Fri Jan 17 15:17:44 2014 +0100
Committer: Sergio Fernández <wi...@apache.org>
Committed: Fri Jan 17 15:17:44 2014 +0100

----------------------------------------------------------------------
 .../ldclient/ldclient-provider-freebase/pom.xml |  3 +-
 .../endpoint/freebase/FreebaseEndpoint.java     |  5 +-
 .../provider/freebase/FreebaseProvider.java     | 53 +++++++++++++++++---
 3 files changed, 49 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/marmotta/blob/9e737af9/libraries/ldclient/ldclient-provider-freebase/pom.xml
----------------------------------------------------------------------
diff --git a/libraries/ldclient/ldclient-provider-freebase/pom.xml b/libraries/ldclient/ldclient-provider-freebase/pom.xml
index 4b55598..4ca4dd1 100644
--- a/libraries/ldclient/ldclient-provider-freebase/pom.xml
+++ b/libraries/ldclient/ldclient-provider-freebase/pom.xml
@@ -57,9 +57,8 @@
             <scope>test</scope>
         </dependency>
         <dependency>
-            <groupId>commons-io</groupId>
+            <groupId>org.apache.commons</groupId>
             <artifactId>commons-io</artifactId>
-            <scope>test</scope>
         </dependency>
         <dependency>
             <groupId>ch.qos.logback</groupId>

http://git-wip-us.apache.org/repos/asf/marmotta/blob/9e737af9/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/endpoint/freebase/FreebaseEndpoint.java
----------------------------------------------------------------------
diff --git a/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/endpoint/freebase/FreebaseEndpoint.java b/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/endpoint/freebase/FreebaseEndpoint.java
index 1964365..688ea5b 100644
--- a/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/endpoint/freebase/FreebaseEndpoint.java
+++ b/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/endpoint/freebase/FreebaseEndpoint.java
@@ -19,16 +19,17 @@ package org.apache.marmotta.ldclient.endpoint.freebase;
 
 import org.apache.marmotta.commons.http.ContentType;
 import org.apache.marmotta.ldclient.api.endpoint.Endpoint;
+import org.apache.marmotta.ldclient.provider.freebase.FreebaseProvider;
 
 /**
- * A particular endpoint for accessing RDF from Freebase.
+ * Endpoint for accessing RDF from Freebase.
  *
  * @author Sergio Fernández
  */
 public class FreebaseEndpoint extends Endpoint {
 
     public FreebaseEndpoint() {
-        super("Freebase", "Freebase", "http(s?)://rdf\\.freebase\\.com/ns/.*", null, 86400L);
+        super(FreebaseProvider.NAME, FreebaseProvider.NAME, FreebaseProvider.PATTERN, null, 86400L);
         setPriority(PRIORITY_MEDIUM);
         addContentType(new ContentType("text", "turtle", 1.0));
         addContentType(new ContentType("text", "plain", 0.2));

http://git-wip-us.apache.org/repos/asf/marmotta/blob/9e737af9/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
----------------------------------------------------------------------
diff --git a/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java b/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
index 38a03d8..699c294 100644
--- a/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
+++ b/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
@@ -20,8 +20,8 @@ package org.apache.marmotta.ldclient.provider.freebase;
 import com.google.common.base.Preconditions;
 
 import javolution.util.function.Predicate;
+import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
-import org.apache.marmotta.commons.http.ContentType;
 import org.apache.marmotta.commons.sesame.model.ModelCommons;
 import org.apache.marmotta.ldclient.api.endpoint.Endpoint;
 import org.apache.marmotta.ldclient.exception.DataRetrievalException;
@@ -30,12 +30,16 @@ import org.openrdf.model.Model;
 import org.openrdf.model.Statement;
 import org.openrdf.rio.RDFFormat;
 import org.openrdf.rio.RDFParseException;
-import org.openrdf.rio.RDFParserRegistry;
+import org.openrdf.rio.Rio;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringWriter;
 import java.util.Collections;
 import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 /**
  * Linked Data patched data provider to Freebase.
@@ -44,8 +48,12 @@ import java.util.List;
  */
 public class FreebaseProvider extends AbstractHttpProvider {
 
-    public static final String PROVIDER_NAME = "Freebase";
+    public static final String NAME = "Freebase";
+    public static final String PATTERN = "http(s?)://rdf\\.freebase\\.com/ns/.*";
     public static final String API = "https://www.googleapis.com/freebase/v1/rdf/";
+    public static final RDFFormat DEFAULT_RDF_FORMAT = RDFFormat.TURTLE;
+    public static final String DEFAULT_ENCODING = "UTF-8";
+    private static final Pattern CHARSET_PATTERN = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");
 
     /**
      * Return the name of this data provider. To be used e.g. in the configuration and in log messages.
@@ -54,7 +62,7 @@ public class FreebaseProvider extends AbstractHttpProvider {
      */
     @Override
     public String getName() {
-        return PROVIDER_NAME;
+        return NAME;
     }
 
     @Override
@@ -73,7 +81,7 @@ public class FreebaseProvider extends AbstractHttpProvider {
      */
     @Override
     public List<String> buildRequestUrl(String uri, Endpoint endpoint) {
-        Preconditions.checkNotNull(uri);
+        Preconditions.checkState(StringUtils.isNotBlank(uri));
         String id = uri.substring(uri.lastIndexOf('/') + 1);
         String url = API + id.replace('.', '/');
         return Collections.singletonList(url);
@@ -81,10 +89,24 @@ public class FreebaseProvider extends AbstractHttpProvider {
 
     @Override
     public List<String> parseResponse(final String resourceUri, final String requestUrl, Model triples, InputStream in, final String contentType) throws DataRetrievalException {
-        Preconditions.checkState(contentType.contains("text/plain"), "Unexpected content type: " + contentType);
-        RDFFormat format = RDFFormat.TURTLE;
+
+        RDFFormat format;
+        if (StringUtils.isNotBlank(contentType) && (contentType.contains("text/plain")||contentType.contains("text/turtle"))) {
+            format = DEFAULT_RDF_FORMAT;
+        } else {
+            format = Rio.getWriterFormatForMIMEType(contentType, DEFAULT_RDF_FORMAT);
+        }
+
+        String encoding = DEFAULT_ENCODING;
+        Matcher m = CHARSET_PATTERN.matcher(contentType);
+        if (StringUtils.isNotBlank(contentType) && m.find()) {
+            encoding = m.group(1).trim().toUpperCase();
+        } else {
+            encoding = DEFAULT_ENCODING;
+        }
+
         try {
-            ModelCommons.add(triples, in, resourceUri, format, new Predicate<Statement>() {
+            ModelCommons.add(triples, fix(in, encoding), resourceUri, format, new Predicate<Statement>() {
                 @Override
                 public boolean test(Statement param) {
                     return StringUtils.equals(param.getSubject().stringValue(), resourceUri);
@@ -96,6 +118,21 @@ public class FreebaseProvider extends AbstractHttpProvider {
         } catch (IOException e) {
             throw new DataRetrievalException("I/O error while trying to read remote Turtle from Freebase", e);
         }
+
+    }
+
+    /**
+     * Fixes Freebase deficiencies on Turtle serialization
+     *
+     * @param in stream with the raw data
+     * @return fixed stream
+     */
+    private InputStream fix(InputStream in, String encoding) throws IOException {
+        StringWriter writer = new StringWriter();
+        IOUtils.copy(in, writer, encoding);
+        String raw = writer.toString();
+        //TODO: perform fixes
+        return new ByteArrayInputStream(raw.getBytes(encoding));
     }
 
 }