You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@marmotta.apache.org by wi...@apache.org on 2014/01/22 10:47:59 UTC
git commit: MARMOTTA-423: improved language code detection for
literals
Updated Branches:
refs/heads/develop 8fdb64475 -> 904a87360
MARMOTTA-423: improved language code detection for literals
Project: http://git-wip-us.apache.org/repos/asf/marmotta/repo
Commit: http://git-wip-us.apache.org/repos/asf/marmotta/commit/904a8736
Tree: http://git-wip-us.apache.org/repos/asf/marmotta/tree/904a8736
Diff: http://git-wip-us.apache.org/repos/asf/marmotta/diff/904a8736
Branch: refs/heads/develop
Commit: 904a87360b62b011058ddf81e0699fdc8b257ba1
Parents: 8fdb644
Author: Sergio Fernández <wi...@apache.org>
Authored: Wed Jan 22 10:47:47 2014 +0100
Committer: Sergio Fernández <wi...@apache.org>
Committed: Wed Jan 22 10:47:47 2014 +0100
----------------------------------------------------------------------
.../provider/freebase/FreebaseProvider.java | 83 ++++----------------
.../test/freebase/TestFreebaseProvider.java | 4 +-
2 files changed, 17 insertions(+), 70 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/marmotta/blob/904a8736/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
----------------------------------------------------------------------
diff --git a/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java b/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
index 1c90eea..ae2fc78 100644
--- a/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
+++ b/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
@@ -57,7 +57,7 @@ public class FreebaseProvider extends AbstractHttpProvider {
public static final RDFFormat DEFAULT_RDF_FORMAT = RDFFormat.TURTLE;
public static final String DEFAULT_ENCODING = "UTF-8";
private static final Pattern CHARSET_PATTERN = Pattern.compile("(?i)\\bcharset=\\s*\"?([^\\s;\"]*)");
- private static final Pattern FREEBASE_LITERAL_PATTERN = Pattern.compile("^\\s+([a-z]+:[a-z_]+(?:\\.+[a-z_]+)*)\\s+\"(.*)\"(@[a-z]+)?(;|\\.)$");
+ private static final Pattern FREEBASE_LITERAL_PATTERN = Pattern.compile("^\\s+([a-z]+:[a-z_]+(?:\\.+[a-z_]+)*)\\s+\"(.*)\"(@[a-z]+(\\-[a-z0-9]+)*)?(;|\\.)$");
private static final Pattern FREEBASE_TRIPLE_PATTERN = Pattern.compile("^\\s+([a-z]+:[a-z_]+(?:\\.+[a-z_]+)*)\\s+(.*)(;|\\.)$");
/**
@@ -146,30 +146,26 @@ public class FreebaseProvider extends AbstractHttpProvider {
try {
final String literal = literalMatcher.group(2);
final String fixed = fixLiteral(literal);
- //log.debug("literal: --{}--{}", literal, fixed);
+ log.debug("literal: --{}--{}", literal, fixed);
String triple = literalMatcher.group(1) + " \"" + fixed + "\"";
if (literalMatcher.group(3) != null) {
triple += literalMatcher.group(3);
}
log.debug("new triple: {}", triple);
- sb.append(" " + triple + literalMatcher.group(4));
+ sb.append(" " + triple + literalMatcher.group(5));
sb.append(("\n"));
} catch (Exception e) {
log.error("Error fixing line, so triple ignored: {}", e.getMessage());
log.error("error on line: {}", line);
- if (line.endsWith(".")) {
- sb.replace(sb.length() - 2, sb.length(), ".\n");
- }
+ warrantyClosing(sb, line);
}
} else {
Matcher tripleMatcher = FREEBASE_TRIPLE_PATTERN.matcher(line);
if (tripleMatcher.matches()) {
String p = tripleMatcher.group(1);
if (p.indexOf("..") >= 0) {
- log.warn("ignoring line due wrong property: {}", p);
- if (line.endsWith(".")) {
- sb.replace(sb.length()-2, sb.length(), ".\n");
- }
+ log.debug("ignoring line due wrong property: {}", p);
+ warrantyClosing(sb, line);
} else {
String o = tripleMatcher.group(2);
if (o.charAt(0) == '<') {
@@ -179,9 +175,7 @@ public class FreebaseProvider extends AbstractHttpProvider {
sb.append("\n");
} catch (RuntimeException e) {
log.error("Object uri not valid: {}", o.substring(1, o.length() - 1));
- if (line.endsWith(".")) {
- sb.replace(sb.length()-2, sb.length(), ".\n");
- }
+ warrantyClosing(sb, line);
}
} else {
if (o.contains("$")) {
@@ -201,10 +195,16 @@ public class FreebaseProvider extends AbstractHttpProvider {
}
}
}
- //System.out.println(sb.toString());
+ System.out.println(sb.toString());
return new ByteArrayInputStream(sb.toString().getBytes());
}
+ private void warrantyClosing(StringBuffer sb, String line) {
+ if (line.endsWith(".")) {
+ sb.replace(sb.length()-2, sb.length(), ".\n");
+ }
+ }
+
private String fixLiteral(String literal) throws UnsupportedEncodingException {
//non-escaped quotes
@@ -237,59 +237,4 @@ public class FreebaseProvider extends AbstractHttpProvider {
return str;
}
- /**
- * Decodes Freebase.com keys using the '<code>$0000</code>' encoding for chars.
- * This encoding uses a 4 digit hex number to represent chars See the
- * Freebase documentation for details.
- *
- * NOTE: copied from Stanbol's FreebaseKeyProcessor.decodeKey()
- * @see http://svn.apache.org/repos/asf/stanbol/trunk/entityhub/indexing/freebase/src/main/java/org/apache/stanbol/entityhub/indexing/freebase/processor/FreebaseKeyProcessor.java
- *
- * @param encodedKey encoded key
- * @return decoded key
- */
- private static String decodeKey(String encodedKey){
- StringBuilder key = null; //lazy initialisation for performance
- int index = 0;
- final int length = encodedKey.length();
- while(index < length){
- int next = encodedKey.indexOf('$', index);
- if(next < 0){
- if(key == null){
- return encodedKey; //no decoding needed
- }
- next = length;
- }
- if(key == null){
- //init the StringBuilder with the maximum possible size
- key = new StringBuilder(encodedKey.length());
- }
- if(next > index){ //add chars that do not need decoding
- key.append(encodedKey, index, next);
- }
- if(next < length){ //decode char
- try {
- if(next+4 < length){
- key.appendCodePoint(Integer.parseInt(
- encodedKey.substring(next+1, next+5), 16));
- } else {
- String section = encodedKey.substring(next, length);
- log.warn("Unable to decode section ["+next+"-"+(length)+"|'"
- + section+"'] from key '"+ encodedKey+"'! -> add plain "
- + "section instead!");
- key.append(section);
- }
- } catch (NumberFormatException e) {
- String section = encodedKey.substring(next, next+5);
- log.warn("Unable to decode section ["+next+"-"+(next+5)+"|'"
- + section+"'] from key '"+ encodedKey+"'! -> add plain "
- + "section instead!");
- key.append(section);
- }
- }
- index = next+5; //add the $0000
- }
- return key.toString();
- }
-
}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/marmotta/blob/904a8736/libraries/ldclient/ldclient-provider-freebase/src/test/java/org/apache/marmotta/ldclient/test/freebase/TestFreebaseProvider.java
----------------------------------------------------------------------
diff --git a/libraries/ldclient/ldclient-provider-freebase/src/test/java/org/apache/marmotta/ldclient/test/freebase/TestFreebaseProvider.java b/libraries/ldclient/ldclient-provider-freebase/src/test/java/org/apache/marmotta/ldclient/test/freebase/TestFreebaseProvider.java
index 7858d85..8391ecb 100644
--- a/libraries/ldclient/ldclient-provider-freebase/src/test/java/org/apache/marmotta/ldclient/test/freebase/TestFreebaseProvider.java
+++ b/libraries/ldclient/ldclient-provider-freebase/src/test/java/org/apache/marmotta/ldclient/test/freebase/TestFreebaseProvider.java
@@ -23,7 +23,8 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * Test the Freebase LDClient Provider.
+ * Some tests over random data to Freebase to warranty that the provider
+ * fixes some common deficiencies in the original data.
*
* @author Sergio Fernández
*/
@@ -83,6 +84,7 @@ public class TestFreebaseProvider extends ProviderTestBase {
testResource("http://rdf.freebase.com/ns/m.0b1t1");
testResource("http://rdf.freebase.com/ns/m.04jpl");
testResource("http://rdf.freebase.com/ns/m.036wy");
+ testResource("http://rdf.freebase.com/ns/m.01d0fp");
}
}