You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@marmotta.apache.org by wi...@apache.org on 2014/06/13 10:58:46 UTC

[086/100] [abbrv] git commit: moved the quick (and dirty) trick for fixing Latin-1 escaped strings from the Freebase provider into commons

moved the quick (and dirty) trick for fixing Latin-1 escaped strings from the Freebase provider into commons


Project: http://git-wip-us.apache.org/repos/asf/marmotta/repo
Commit: http://git-wip-us.apache.org/repos/asf/marmotta/commit/b00e935d
Tree: http://git-wip-us.apache.org/repos/asf/marmotta/tree/b00e935d
Diff: http://git-wip-us.apache.org/repos/asf/marmotta/diff/b00e935d

Branch: refs/heads/ldp
Commit: b00e935d0fce5944367ed6f41f8ae108e008edf3
Parents: c5f0d76
Author: Sergio Fernández <wi...@apache.org>
Authored: Thu Apr 24 20:16:07 2014 +0200
Committer: Sergio Fernández <wi...@apache.org>
Committed: Thu Apr 24 20:16:07 2014 +0200

----------------------------------------------------------------------
 .../marmotta/commons/util/StringUtils.java      | 40 ++++++++++++++++++++
 .../ldclient/ldclient-provider-freebase/pom.xml |  4 ++
 .../provider/freebase/FreebaseProvider.java     | 20 ++--------
 3 files changed, 47 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/marmotta/blob/b00e935d/commons/marmotta-commons/src/main/java/org/apache/marmotta/commons/util/StringUtils.java
----------------------------------------------------------------------
diff --git a/commons/marmotta-commons/src/main/java/org/apache/marmotta/commons/util/StringUtils.java b/commons/marmotta-commons/src/main/java/org/apache/marmotta/commons/util/StringUtils.java
new file mode 100644
index 0000000..3e597c4
--- /dev/null
+++ b/commons/marmotta-commons/src/main/java/org/apache/marmotta/commons/util/StringUtils.java
@@ -0,0 +1,40 @@
+package org.apache.marmotta.commons.util;
+
+/**
+ * Some util string functions
+ *
+ * @author Sergio Fernández
+ */
+public class StringUtils {
+
+    private StringUtils() {
+
+    }
+
+    public static String fixLatin1(String str) {
+        //TODO: find a way to re-code properly the literal
+        //http://www.ic.unicamp.br/~stolfi/EXPORT/www/ISO-8859-1-Encoding.html
+        str = str.replaceAll("\\\\xe1", "á");
+        str = str.replaceAll("\\\\xe2", "â");
+        str = str.replaceAll("\\\\xe3", "ã");
+        str = str.replaceAll("\\\\xe4", "ä");
+        str = str.replaceAll("\\\\xe7", "ç");
+        str = str.replaceAll("\\\\xe8", "è");
+        str = str.replaceAll("\\\\xe9", "é");
+        str = str.replaceAll("\\\\xea", "ê");
+        str = str.replaceAll("\\\\xeb", "ë");
+        str = str.replaceAll("\\\\xed", "í");
+        str = str.replaceAll("\\\\xee", "î");
+        str = str.replaceAll("\\\\xef", "ï");
+        str = str.replaceAll("\\\\xf3", "ó");
+        str = str.replaceAll("\\\\xf4", "ô");
+        str = str.replaceAll("\\\\xf6", "ö");
+        str = str.replaceAll("\\\\xf9", "ù");
+        str = str.replaceAll("\\\\xfb", "û");
+        str = str.replaceAll("\\\\xfc", "ü");
+        str = str.replaceAll("\\\\xfa", "ú");
+        str = str.replaceAll("\\\\x", ""); //FIXME: wrong, wrong, wrong!
+        return str;
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/marmotta/blob/b00e935d/libraries/ldclient/ldclient-provider-freebase/pom.xml
----------------------------------------------------------------------
diff --git a/libraries/ldclient/ldclient-provider-freebase/pom.xml b/libraries/ldclient/ldclient-provider-freebase/pom.xml
index 39f9848..5cba92c 100644
--- a/libraries/ldclient/ldclient-provider-freebase/pom.xml
+++ b/libraries/ldclient/ldclient-provider-freebase/pom.xml
@@ -42,6 +42,10 @@
             <groupId>org.openrdf.sesame</groupId>
             <artifactId>sesame-rio-turtle</artifactId>
         </dependency>
+        <dependency>
+            <groupId>org.apache.marmotta</groupId>
+            <artifactId>marmotta-commons</artifactId>
+        </dependency>
 
         <dependency>
             <groupId>junit</groupId>

http://git-wip-us.apache.org/repos/asf/marmotta/blob/b00e935d/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
----------------------------------------------------------------------
diff --git a/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java b/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
index ef3652b..fe39296 100644
--- a/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
+++ b/libraries/ldclient/ldclient-provider-freebase/src/main/java/org/apache/marmotta/ldclient/provider/freebase/FreebaseProvider.java
@@ -183,7 +183,7 @@ public class FreebaseProvider extends AbstractHttpProvider {
                             } else if (o.contains("\\u")) {
                                 o = StringEscapeUtils.unescapeJava(o);
                             } else if (o.contains("\\x")) {
-                                o = fixLatin1(o);
+                                o = org.apache.marmotta.commons.util.StringUtils.fixLatin1(o);
                             }
                             sb.append("    " + p + "    " + o + tripleMatcher.group(3));
                             sb.append("\n");
@@ -213,7 +213,7 @@ public class FreebaseProvider extends AbstractHttpProvider {
 
         //wrong charset
         if (literal.contains("\\x")) {
-            literal = fixLatin1(literal);
+            literal = org.apache.marmotta.commons.util.StringUtils.fixLatin1(literal);
         }
 
         //wrong unicode encoding
@@ -224,18 +224,4 @@ public class FreebaseProvider extends AbstractHttpProvider {
         return literal;
     }
 
-    private String fixLatin1(String str) {
-        //TODO: find a way to re-code properly the literal
-        //http://www.ic.unicamp.br/~stolfi/EXPORT/www/ISO-8859-1-Encoding.html
-        str = str.replaceAll("\\\\xe1", "á");
-        str = str.replaceAll("\\\\xe3", "ã");
-        str = str.replaceAll("\\\\xe7", "ç");
-        str = str.replaceAll("\\\\xe9", "é");
-        str = str.replaceAll("\\\\xed", "í");
-        str = str.replaceAll("\\\\xf3", "ó");
-        str = str.replaceAll("\\\\xfa", "ú");
-        str = str.replaceAll("\\\\x", ""); //FIXME: wrong, wrong, wrong!
-        return str;
-    }
-
-}
\ No newline at end of file
+}