You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2012/09/08 22:43:22 UTC

svn commit: r1382362 - in /jena/trunk/jena-arq/src: main/java/org/openjena/atlas/lib/ main/java/org/openjena/riot/ main/java/org/openjena/riot/lang/ test/java/org/openjena/riot/

Author: andy
Date: Sat Sep  8 20:43:22 2012
New Revision: 1382362

URL: http://svn.apache.org/viewvc?rev=1382362&view=rev
Log:
Checking of language tags.

Modified:
    jena/trunk/jena-arq/src/main/java/org/openjena/atlas/lib/Chars.java
    jena/trunk/jena-arq/src/main/java/org/openjena/riot/LangTag.java
    jena/trunk/jena-arq/src/main/java/org/openjena/riot/lang/LangRDFXML.java
    jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestChecker.java
    jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestLangTag.java

Modified: jena/trunk/jena-arq/src/main/java/org/openjena/atlas/lib/Chars.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-arq/src/main/java/org/openjena/atlas/lib/Chars.java?rev=1382362&r1=1382361&r2=1382362&view=diff
==============================================================================
--- jena/trunk/jena-arq/src/main/java/org/openjena/atlas/lib/Chars.java (original)
+++ jena/trunk/jena-arq/src/main/java/org/openjena/atlas/lib/Chars.java Sat Sep  8 20:43:22 2012
@@ -214,6 +214,7 @@ public class Chars
     public static final char CH_HASH         = '#' ;
     public static final char CH_PLUS         = '+' ;
     public static final char CH_MINUS        = '-' ;
+    public static final char CH_DASH         = '-' ; // Alt name
     public static final char CH_SLASH        = '/' ;
     public static final char CH_RSLASH       = '\\' ;
     
@@ -249,6 +250,7 @@ public class Chars
     public static final byte B_HASH          = '#' ;
     public static final byte B_PLUS          = '+' ;
     public static final byte B_MINUS         = '-' ;
+    public static final byte B_DASH          = '-' ; // Alt name
     public static final byte B_SLASH         = '/' ;
     public static final byte B_RSLASH        = '\\' ;
     

Modified: jena/trunk/jena-arq/src/main/java/org/openjena/riot/LangTag.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-arq/src/main/java/org/openjena/riot/LangTag.java?rev=1382362&r1=1382361&r2=1382362&view=diff
==============================================================================
--- jena/trunk/jena-arq/src/main/java/org/openjena/riot/LangTag.java (original)
+++ jena/trunk/jena-arq/src/main/java/org/openjena/riot/LangTag.java Sat Sep  8 20:43:22 2012
@@ -21,6 +21,9 @@ package org.openjena.riot;
 import java.util.regex.Matcher ;
 import java.util.regex.Pattern ;
 
+import org.openjena.atlas.lib.Chars ;
+import org.openjena.riot.system.RiotChars ;
+
 
 /**
  * Language tags: support for parsing and canonicalization of case. 
@@ -141,6 +144,52 @@ public class LangTag
     private static Pattern patternPrivateuse    = Pattern.compile(privateuseRE) ;
     private static Pattern patternGrandfathered = Pattern.compile(grandfatheredRE) ; 
     
+    /** Validate - basic syntax check for a language tag: [a-zA-Z]+ ('-' [a-zA-Z0-9]+)*
+     *  @param languageTag the tag to test; null is not accepted (NullPointerException)
+     *  @return true only if the whole tag matches; the empty string, an empty
+     *          subtag and a trailing '-' are all rejected. */
+    public static boolean check(String languageTag)
+    {
+        int len = languageTag.length() ;
+        int idx = 0;
+        boolean first = true ;
+        while ( idx < len )
+        {
+            int idx2 = checkPart(languageTag, idx, first) ;
+            first = false ;
+            if ( idx2 == idx )
+                return false ;          // Zero length part.
+            idx = idx2 ;
+            if ( idx == len )
+                return true ;           // Ended on a complete part.
+            if ( languageTag.charAt(idx) != Chars.CH_DASH )
+                return false ;          // Illegal character for this part.
+            idx ++ ;
+        }
+        // Only reached for the empty string or immediately after a trailing DASH.
+        return false ;
+    }
+    
+    private static int checkPart(String languageTag, int idx, boolean leader)
+    {
+        for ( ; idx < languageTag.length() ; idx++)
+        {
+            int ch = languageTag.charAt(idx) ;
+            if ( leader )
+            {
+                if ( RiotChars.isA2Z(ch) ) continue ;
+            }
+            else
+            {
+                if ( RiotChars.isA2ZN(ch) ) continue ;
+            }
+            // Not acceptable.
+            return idx ;
+        }
+        // Off end.
+        return idx ;
+    }
+
     /** Parse a langtag string and return it's parts in canonical case.
      *  See constants for the array contents.  Parts not present cause a null
      *  in the return array. 

Modified: jena/trunk/jena-arq/src/main/java/org/openjena/riot/lang/LangRDFXML.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-arq/src/main/java/org/openjena/riot/lang/LangRDFXML.java?rev=1382362&r1=1382361&r2=1382362&view=diff
==============================================================================
--- jena/trunk/jena-arq/src/main/java/org/openjena/riot/lang/LangRDFXML.java (original)
+++ jena/trunk/jena-arq/src/main/java/org/openjena/riot/lang/LangRDFXML.java Sat Sep  8 20:43:22 2012
@@ -36,12 +36,8 @@ import com.hp.hpl.jena.datatypes.RDFData
 import com.hp.hpl.jena.datatypes.TypeMapper ;
 import com.hp.hpl.jena.graph.Node ;
 import com.hp.hpl.jena.graph.Triple ;
-import com.hp.hpl.jena.rdf.arp.ALiteral ;
-import com.hp.hpl.jena.rdf.arp.ARP ;
-import com.hp.hpl.jena.rdf.arp.AResource ;
-import com.hp.hpl.jena.rdf.arp.NamespaceHandler ;
-import com.hp.hpl.jena.rdf.arp.ParseException ;
-import com.hp.hpl.jena.rdf.arp.StatementHandler ;
+import com.hp.hpl.jena.rdf.arp.* ;
+import static com.hp.hpl.jena.rdf.arp.ARPErrorNumbers.* ;
 import com.hp.hpl.jena.rdf.arp.impl.ARPSaxErrorHandler ;
 import com.hp.hpl.jena.rdf.model.RDFErrorHandler ;
 
@@ -108,6 +104,14 @@ public class LangRDFXML implements LangR
     @Override
     public Lang getLang()   { return Lang.RDFXML ; }
 
+    public static boolean RiotUniformCompatibility = false ;
+    // Warnings in ARP that should be errors to be compatible with
+    // non-XML-based languages.  e.g. language tags should be
+    // syntactically valid.
+    private static int[] additionalErrors = new int[] {
+        WARN_MALFORMED_XMLLANG
+        //, WARN_STRING_NOT_NORMAL_FORM_C
+    } ;
     
     @Override
     public void parse()
@@ -119,6 +123,15 @@ public class LangRDFXML implements LangR
         arp.getHandlers().setErrorHandler(rslt) ;
         arp.getHandlers().setNamespaceHandler(rslt) ;
         
+        if ( RiotUniformCompatibility )
+        {
+            ARPOptions options = arp.getOptions() ;
+            // Convert some warnings to errors for compatible behaviour for all parsers. 
+            for ( int code : additionalErrors )
+                options.setErrorMode(code, EM_FATAL) ;
+            arp.setOptionsWith(options) ;
+        }
+        
         try {
             if ( reader != null )
                 arp.load(reader, xmlBase);

Modified: jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestChecker.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestChecker.java?rev=1382362&r1=1382361&r2=1382362&view=diff
==============================================================================
--- jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestChecker.java (original)
+++ jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestChecker.java Sat Sep  8 20:43:22 2012
@@ -18,7 +18,6 @@
 
 package org.openjena.riot;
 
-
 import org.junit.After ;
 import org.junit.Before ;
 import org.junit.Test ;

Modified: jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestLangTag.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestLangTag.java?rev=1382362&r1=1382361&r2=1382362&view=diff
==============================================================================
--- jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestLangTag.java (original)
+++ jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestLangTag.java Sat Sep  8 20:43:22 2012
@@ -18,76 +18,67 @@
 
 package org.openjena.riot;
 
-import java.util.Arrays;
-import java.util.Collection;
+import org.junit.Test ;
+import org.openjena.atlas.junit.BaseTest ;
 
 
-import org.junit.Assert;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-import org.openjena.riot.LangTag ;
-
-
-@RunWith(Parameterized.class)
-
-public class TestLangTag
+public class TestLangTag extends BaseTest
 {
+    @Test public void parse_01() 
+    { parseGood("en",                  "en",               "en", null, null, null, null) ; }
+
+    @Test public void parse_02()
+    { parseGood("en-uk",               "en-UK",            "en", null, "UK", null, null) ; }
+    
+    @Test public void parse_03()
+    { parseGood("es-419",              "es-419",           "es", null, "419", null, null) ; }
+    
+    @Test public void parse_04()
+    { parseGood("zh-Hant",             "zh-Hant",          "zh", "Hant", null, null, null) ; }
+    
+    @Test public void parse_05()
+    { parseGood("sr-Latn-CS",          "sr-Latn-CS",       "sr", "Latn", "CS", null, null) ; }
     
+    @Test public void parse_06()
+    { parseGood("sl-nedis",            "sl-nedis",         "sl", null, null, "nedis", null) ; }
     
-    @Parameters public static Collection<Object[]> data()
+    @Test public void parse_07()
+    { parseGood("sl-IT-nedis",         "sl-IT-nedis",      "sl", null, "IT", "nedis", null) ; }
     
+    @Test public void parse_08()
+    { parseGood("sl-Latn-IT-nedis",    "sl-Latn-IT-nedis", "sl", "Latn", "IT", "nedis", null) ; }
+    
+    @Test public void parse_09()
+    { parseGood("de-CH-x-Phonebk",     "de-CH-x-Phonebk",  "de", null, "CH", null, "x-Phonebk") ; }
+    
+    @Test public void parse_10()
+    { parseGood("zh-cn-a-myExt-x-private", "zh-CN-a-myExt-x-private", 
+                                      "zh", null, "CN", null, "a-myExt-x-private") ; }
+    
+    @Test public void parse_bad_01() { parseBad("i18n") ; }
+    @Test public void parse_bad_02() { parseBad("i@n") ; }
+    @Test public void parse_bad_03() { parseBad("123-abc") ; }
+    @Test public void parse_bad_04() { parseBad("en-") ; }
+    
+    private static void parseGood(String input, String ex_output, String... ex_parts )
     {
-        return Arrays.asList(new Object[][] {
-            // input, language, script, region, variant, extension 
-            //{"en", new String[]{"en", "junk", null, null }},
-            
-            {"en",                  "en",               new String[]{"en", null, null, null, null}},
-            {"en-uk",               "en-UK",            new String[]{"en", null, "UK", null, null}},
-            {"es-419",              "es-419",           new String[]{"es", null, "419", null, null}},
-            {"zh-Hant",             "zh-Hant",          new String[]{"zh", "Hant", null, null, null}},
-            {"sr-Latn-CS",          "sr-Latn-CS",       new String[]{"sr", "Latn", "CS", null, null}},
-            {"sl-nedis",            "sl-nedis",         new String[]{"sl", null, null, "nedis", null}},
-            {"sl-IT-nedis",         "sl-IT-nedis",      new String[]{"sl", null, "IT", "nedis", null}},
-            {"sl-Latn-IT-nedis",    "sl-Latn-IT-nedis", new String[]{"sl", "Latn", "IT", "nedis", null}},
-            {"de-CH-x-Phonebk",     "de-CH-x-Phonebk",  new String[]{"de", null, "CH", null, "x-Phonebk"}},
-            {"zh-cn-a-myExt-x-private", "zh-CN-a-myExt-x-private", new String[]{"zh", null, "CN", null, "a-myExt-x-private"}},
-
-            {"12345", "12345", null},
-//            
-//            {"en", "en", new String[]{"en", null, null, null, null}},
-//            {"en-uk", "en-UK", new String[]{"en", null, "UK", null, null}},
-//            {"es-419", "es-419", new String[]{"es", null, "419", null, null}},
-//            {"zh-hant", "zh-Hant", new String[]{"zh", "Hant", null, null, null}},
-//            {"sr-latn-cs", "sr-Latn-CS", new String[]{"sr", "Latn", "CS", null, null}},
-//            {"sl-nedis", "sl-nedis", new String[]{"sl", null, null, "nedis", null}},
-//            {"sl-IT-nedis", "sl-IT-nedis", new String[]{"sl", null, "IT", "nedis", null}},
-//            {"SL-latn-it-Nedis", "sl-Latn-IT-nedis", new String[]{"sl", "Latn", "IT", "nedis", null}},
-//            {"12345", "12345", (String[])null},
-            });
+      String[] parts = LangTag.parse(input) ;
+      assertArrayEquals(ex_parts, parts) ;
+      
+      String output = LangTag.canonical(input) ;
+      assertEquals(ex_output, output) ;
+      
+      assertTrue(LangTag.check(input)) ;
     }
 
-    private String input ;
-    private String[] parts ;
-    private String output ;
-    
-    public TestLangTag(String input, String output, String[] parts)
-    { 
-        this.input = input ;
-        this.output = output ;
-        this.parts = parts ;
-    }
-   
-    @Test
-    public void verify()
+    
+    private static void parseBad(String input)
     {
-        //System.out.println(input+" ==> "+output) ;
-        
         String[] parts = LangTag.parse(input) ;
+        assertNull(parts) ;
         String output = LangTag.canonical(input) ;
-        Assert.assertArrayEquals(this.parts, parts) ;
-        Assert.assertEquals(this.output, output) ;
+        assertEquals(input, output) ;
+        assertFalse(LangTag.check(input)) ;
     }
 
 }