You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2012/09/08 22:43:22 UTC
svn commit: r1382362 - in /jena/trunk/jena-arq/src:
main/java/org/openjena/atlas/lib/ main/java/org/openjena/riot/
main/java/org/openjena/riot/lang/ test/java/org/openjena/riot/
Author: andy
Date: Sat Sep 8 20:43:22 2012
New Revision: 1382362
URL: http://svn.apache.org/viewvc?rev=1382362&view=rev
Log:
Checking of language tags.
Modified:
jena/trunk/jena-arq/src/main/java/org/openjena/atlas/lib/Chars.java
jena/trunk/jena-arq/src/main/java/org/openjena/riot/LangTag.java
jena/trunk/jena-arq/src/main/java/org/openjena/riot/lang/LangRDFXML.java
jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestChecker.java
jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestLangTag.java
Modified: jena/trunk/jena-arq/src/main/java/org/openjena/atlas/lib/Chars.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-arq/src/main/java/org/openjena/atlas/lib/Chars.java?rev=1382362&r1=1382361&r2=1382362&view=diff
==============================================================================
--- jena/trunk/jena-arq/src/main/java/org/openjena/atlas/lib/Chars.java (original)
+++ jena/trunk/jena-arq/src/main/java/org/openjena/atlas/lib/Chars.java Sat Sep 8 20:43:22 2012
@@ -214,6 +214,7 @@ public class Chars
public static final char CH_HASH = '#' ;
public static final char CH_PLUS = '+' ;
public static final char CH_MINUS = '-' ;
+ public static final char CH_DASH = '-' ; // Alt name
public static final char CH_SLASH = '/' ;
public static final char CH_RSLASH = '\\' ;
@@ -249,6 +250,7 @@ public class Chars
public static final byte B_HASH = '#' ;
public static final byte B_PLUS = '+' ;
public static final byte B_MINUS = '-' ;
+ public static final byte B_DASH = '-' ; // Alt name
public static final byte B_SLASH = '/' ;
public static final byte B_RSLASH = '\\' ;
Modified: jena/trunk/jena-arq/src/main/java/org/openjena/riot/LangTag.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-arq/src/main/java/org/openjena/riot/LangTag.java?rev=1382362&r1=1382361&r2=1382362&view=diff
==============================================================================
--- jena/trunk/jena-arq/src/main/java/org/openjena/riot/LangTag.java (original)
+++ jena/trunk/jena-arq/src/main/java/org/openjena/riot/LangTag.java Sat Sep 8 20:43:22 2012
@@ -21,6 +21,9 @@ package org.openjena.riot;
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;
+import org.openjena.atlas.lib.Chars ;
+import org.openjena.riot.system.RiotChars ;
+
/**
* Language tags: support for parsing and canonicalization of case.
@@ -141,6 +144,52 @@ public class LangTag
private static Pattern patternPrivateuse = Pattern.compile(privateuseRE) ;
private static Pattern patternGrandfathered = Pattern.compile(grandfatheredRE) ;
+ /** Validate - basic syntax check for a language tag: [a-zA-Z]+ ('-' [a-zA-Z0-9]+)* */
+ public static boolean check(String languageTag)
+ {
+ int len = languageTag.length() ;
+ int idx = 0;
+ boolean first = true ;
+ while ( idx < languageTag.length() )
+ {
+ int idx2 = checkPart(languageTag, idx, first) ;
+ first = false ;
+ if ( idx2 == idx )
+ // zero length part.
+ return false ;
+ idx = idx2 ;
+ if ( idx == len )
+ return true ;
+ if ( languageTag.charAt(idx) != Chars.CH_DASH )
+ return false ;
+ idx ++ ;
+ if ( idx == len)
+ // trailing DASH
+ return false ;
+ }
+ return true ;
+ }
+
+ private static int checkPart(String languageTag, int idx, boolean leader)
+ {
+ for ( ; idx < languageTag.length() ; idx++)
+ {
+ int ch = languageTag.charAt(idx) ;
+ if ( leader )
+ {
+ if ( RiotChars.isA2Z(ch) ) continue ;
+ }
+ else
+ {
+ if ( RiotChars.isA2ZN(ch) ) continue ;
+ }
+ // Not acceptable.
+ return idx ;
+ }
+ // Off end.
+ return idx ;
+ }
+
/** Parse a langtag string and return it's parts in canonical case.
* See constants for the array contents. Parts not present cause a null
* in the return array.
Modified: jena/trunk/jena-arq/src/main/java/org/openjena/riot/lang/LangRDFXML.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-arq/src/main/java/org/openjena/riot/lang/LangRDFXML.java?rev=1382362&r1=1382361&r2=1382362&view=diff
==============================================================================
--- jena/trunk/jena-arq/src/main/java/org/openjena/riot/lang/LangRDFXML.java (original)
+++ jena/trunk/jena-arq/src/main/java/org/openjena/riot/lang/LangRDFXML.java Sat Sep 8 20:43:22 2012
@@ -36,12 +36,8 @@ import com.hp.hpl.jena.datatypes.RDFData
import com.hp.hpl.jena.datatypes.TypeMapper ;
import com.hp.hpl.jena.graph.Node ;
import com.hp.hpl.jena.graph.Triple ;
-import com.hp.hpl.jena.rdf.arp.ALiteral ;
-import com.hp.hpl.jena.rdf.arp.ARP ;
-import com.hp.hpl.jena.rdf.arp.AResource ;
-import com.hp.hpl.jena.rdf.arp.NamespaceHandler ;
-import com.hp.hpl.jena.rdf.arp.ParseException ;
-import com.hp.hpl.jena.rdf.arp.StatementHandler ;
+import com.hp.hpl.jena.rdf.arp.* ;
+import static com.hp.hpl.jena.rdf.arp.ARPErrorNumbers.* ;
import com.hp.hpl.jena.rdf.arp.impl.ARPSaxErrorHandler ;
import com.hp.hpl.jena.rdf.model.RDFErrorHandler ;
@@ -108,6 +104,14 @@ public class LangRDFXML implements LangR
@Override
public Lang getLang() { return Lang.RDFXML ; }
+ public static boolean RiotUniformCompatibility = false ;
+ // Warnings in ARP that should be errors to be compatible with
+ // non-XML-based languages. e.g. language tags should be
+ // syntactically valid.
+ private static int[] additionalErrors = new int[] {
+ WARN_MALFORMED_XMLLANG
+ //, WARN_STRING_NOT_NORMAL_FORM_C
+ } ;
@Override
public void parse()
@@ -119,6 +123,15 @@ public class LangRDFXML implements LangR
arp.getHandlers().setErrorHandler(rslt) ;
arp.getHandlers().setNamespaceHandler(rslt) ;
+ if ( RiotUniformCompatibility )
+ {
+ ARPOptions options = arp.getOptions() ;
+ // Convert some warnings to errors for compatible behaviour for all parsers.
+ for ( int code : additionalErrors )
+ options.setErrorMode(code, EM_FATAL) ;
+ arp.setOptionsWith(options) ;
+ }
+
try {
if ( reader != null )
arp.load(reader, xmlBase);
Modified: jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestChecker.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestChecker.java?rev=1382362&r1=1382361&r2=1382362&view=diff
==============================================================================
--- jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestChecker.java (original)
+++ jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestChecker.java Sat Sep 8 20:43:22 2012
@@ -18,7 +18,6 @@
package org.openjena.riot;
-
import org.junit.After ;
import org.junit.Before ;
import org.junit.Test ;
Modified: jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestLangTag.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestLangTag.java?rev=1382362&r1=1382361&r2=1382362&view=diff
==============================================================================
--- jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestLangTag.java (original)
+++ jena/trunk/jena-arq/src/test/java/org/openjena/riot/TestLangTag.java Sat Sep 8 20:43:22 2012
@@ -18,76 +18,67 @@
package org.openjena.riot;
-import java.util.Arrays;
-import java.util.Collection;
+import org.junit.Test ;
+import org.openjena.atlas.junit.BaseTest ;
-import org.junit.Assert;
-import org.junit.Test;
-import org.junit.runner.RunWith;
-import org.junit.runners.Parameterized;
-import org.junit.runners.Parameterized.Parameters;
-import org.openjena.riot.LangTag ;
-
-
-@RunWith(Parameterized.class)
-
-public class TestLangTag
+public class TestLangTag extends BaseTest
{
+ @Test public void parse_01()
+ { parseGood("en", "en", "en", null, null, null, null) ; }
+
+ @Test public void parse_02()
+ { parseGood("en-uk", "en-UK", "en", null, "UK", null, null) ; }
+
+ @Test public void parse_03()
+ { parseGood("es-419", "es-419", "es", null, "419", null, null) ; }
+
+ @Test public void parse_04()
+ { parseGood("zh-Hant", "zh-Hant", "zh", "Hant", null, null, null) ; }
+
+ @Test public void parse_05()
+ { parseGood("sr-Latn-CS", "sr-Latn-CS", "sr", "Latn", "CS", null, null) ; }
+ @Test public void parse_06()
+ { parseGood("sl-nedis", "sl-nedis", "sl", null, null, "nedis", null) ; }
- @Parameters public static Collection<Object[]> data()
+ @Test public void parse_07()
+ { parseGood("sl-IT-nedis", "sl-IT-nedis", "sl", null, "IT", "nedis", null) ; }
+ @Test public void parse_08()
+ { parseGood("sl-Latn-IT-nedis", "sl-Latn-IT-nedis", "sl", "Latn", "IT", "nedis", null) ; }
+
+ @Test public void parse_09()
+ { parseGood("de-CH-x-Phonebk", "de-CH-x-Phonebk", "de", null, "CH", null, "x-Phonebk") ; }
+
+ @Test public void parse_10()
+ { parseGood("zh-cn-a-myExt-x-private", "zh-CN-a-myExt-x-private",
+ "zh", null, "CN", null, "a-myExt-x-private") ; }
+
+ @Test public void parse_bad_01() { parseBad("i18n") ; }
+ @Test public void parse_bad_02() { parseBad("i@n") ; }
+ @Test public void parse_bad_03() { parseBad("123-abc") ; }
+ @Test public void parse_bad_04() { parseBad("en-") ; }
+
+ private static void parseGood(String input, String ex_output, String... ex_parts )
{
- return Arrays.asList(new Object[][] {
- // input, language, script, region, variant, extension
- //{"en", new String[]{"en", "junk", null, null }},
-
- {"en", "en", new String[]{"en", null, null, null, null}},
- {"en-uk", "en-UK", new String[]{"en", null, "UK", null, null}},
- {"es-419", "es-419", new String[]{"es", null, "419", null, null}},
- {"zh-Hant", "zh-Hant", new String[]{"zh", "Hant", null, null, null}},
- {"sr-Latn-CS", "sr-Latn-CS", new String[]{"sr", "Latn", "CS", null, null}},
- {"sl-nedis", "sl-nedis", new String[]{"sl", null, null, "nedis", null}},
- {"sl-IT-nedis", "sl-IT-nedis", new String[]{"sl", null, "IT", "nedis", null}},
- {"sl-Latn-IT-nedis", "sl-Latn-IT-nedis", new String[]{"sl", "Latn", "IT", "nedis", null}},
- {"de-CH-x-Phonebk", "de-CH-x-Phonebk", new String[]{"de", null, "CH", null, "x-Phonebk"}},
- {"zh-cn-a-myExt-x-private", "zh-CN-a-myExt-x-private", new String[]{"zh", null, "CN", null, "a-myExt-x-private"}},
-
- {"12345", "12345", null},
-//
-// {"en", "en", new String[]{"en", null, null, null, null}},
-// {"en-uk", "en-UK", new String[]{"en", null, "UK", null, null}},
-// {"es-419", "es-419", new String[]{"es", null, "419", null, null}},
-// {"zh-hant", "zh-Hant", new String[]{"zh", "Hant", null, null, null}},
-// {"sr-latn-cs", "sr-Latn-CS", new String[]{"sr", "Latn", "CS", null, null}},
-// {"sl-nedis", "sl-nedis", new String[]{"sl", null, null, "nedis", null}},
-// {"sl-IT-nedis", "sl-IT-nedis", new String[]{"sl", null, "IT", "nedis", null}},
-// {"SL-latn-it-Nedis", "sl-Latn-IT-nedis", new String[]{"sl", "Latn", "IT", "nedis", null}},
-// {"12345", "12345", (String[])null},
- });
+ String[] parts = LangTag.parse(input) ;
+ assertArrayEquals(ex_parts, parts) ;
+
+ String output = LangTag.canonical(input) ;
+ assertEquals(ex_output, output) ;
+
+ assertTrue(LangTag.check(input)) ;
}
- private String input ;
- private String[] parts ;
- private String output ;
-
- public TestLangTag(String input, String output, String[] parts)
- {
- this.input = input ;
- this.output = output ;
- this.parts = parts ;
- }
-
- @Test
- public void verify()
+
+ private static void parseBad(String input)
{
- //System.out.println(input+" ==> "+output) ;
-
String[] parts = LangTag.parse(input) ;
+ assertNull(parts) ;
String output = LangTag.canonical(input) ;
- Assert.assertArrayEquals(this.parts, parts) ;
- Assert.assertEquals(this.output, output) ;
+ assertEquals(input, output) ;
+ assertFalse(LangTag.check(input)) ;
}
}