You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2014/01/21 21:33:07 UTC
svn commit: r1560173 - in /jena/trunk/jena-arq/src:
main/java/org/apache/jena/riot/web/LangTag.java
test/java/org/apache/jena/riot/web/TestLangTag.java
Author: andy
Date: Tue Jan 21 20:33:06 2014
New Revision: 1560173
URL: http://svn.apache.org/r1560173
Log:
Tidy up code and comments.
Add tests for canonical form.
Modified:
jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/web/LangTag.java
jena/trunk/jena-arq/src/test/java/org/apache/jena/riot/web/TestLangTag.java
Modified: jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/web/LangTag.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/web/LangTag.java?rev=1560173&r1=1560172&r2=1560173&view=diff
==============================================================================
--- jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/web/LangTag.java (original)
+++ jena/trunk/jena-arq/src/main/java/org/apache/jena/riot/web/LangTag.java Tue Jan 21 20:33:06 2014
@@ -16,7 +16,7 @@
* limitations under the License.
*/
-package org.apache.jena.riot.web;
+package org.apache.jena.riot.web ;
import java.util.Locale ;
import java.util.regex.Matcher ;
@@ -25,134 +25,130 @@ import java.util.regex.Pattern ;
import org.apache.jena.atlas.lib.Chars ;
import org.apache.jena.riot.system.RiotChars ;
-
/**
- * Language tags: support for parsing and canonicalization of case.
- * Grandfathered forms ("i-") are left untouched.
- * Unsupported or syntactically illegal forms are handled in
- * canonicalization by doing nothing.
+ * Language tags: support for parsing and canonicalization of case.
+ * Grandfathered forms ("i-") are left untouched. Unsupported or syntactically
+ * illegal forms are handled in canonicalization by doing nothing.
* <ul>
- * <li>Language tags syntax: <a href="http://www.ietf.org/rfc/rfc4646.txt">RFC 4646</a></li>
- * <li>Matching Language tags: <a href="http://www.ietf.org/rfc/rfc4647.txt">RFC 4647</a></li>
- * <li>Language tags syntax: <a href="http://www.ietf.org/rfc/rfc5646.txt">RFC 5646</a></li>
+ * <li>Language tags syntax: <a href="http://www.ietf.org/rfc/rfc4646.txt">RFC
+ * 4646</a></li>
+ * <li>Matching Language tags: <a href="http://www.ietf.org/rfc/rfc4647.txt">RFC
+ * 4647</a></li>
+ * <li>Language tags syntax: <a href="http://www.ietf.org/rfc/rfc5646.txt">RFC
+ * 5646</a></li>
* </ul>
*/
-
-public class LangTag
-{
- // See also http://tools.ietf.org/html/rfc5646 - irregular lang tags
-
+
+public class LangTag {
+ // Valid language tag, not irregular nor grandfathered.
/** Index of the language part */
- public static final int idxLanguage = 0 ;
- /** Index of the script part */
- public static final int idxScript = 1 ;
+ public static final int idxLanguage = 0 ;
+ /** Index of the script part */
+ public static final int idxScript = 1 ;
/** Index of the region part */
- public static final int idxRegion = 2 ;
+ public static final int idxRegion = 2 ;
/** Index of the variant part */
- public static final int idxVariant = 3 ;
+ public static final int idxVariant = 3 ;
/** Index of all extensions */
- public static final int idxExtension = 4 ;
-
- private static final int partsLength = 5 ;
-
- private LangTag(){}
-
- // ABNF is defined in http://www.ietf.org/rfc/rfc4234.txt
+ public static final int idxExtension = 4 ;
+
+ private static final int partsLength = 5 ;
+
+ private LangTag() {}
+
+ // Defined by BCP 47 which is currently RFC5646 which obsoletes RFC4646.
+
+ // Canonical forms:
+ /*
+ * RFC 4646 In this format, all non-initial two-letter subtags are
+ * uppercase, all non-initial four-letter subtags are titlecase, and all
+ * other subtags are lowercase.
+ */
+ /*
+ * RFC 5646 An implementation can reproduce this format without accessing
+ * the registry as follows. All subtags, including extension and private use
+ * subtags, use lowercase letters with two exceptions: two-letter and
+ * four-letter subtags that neither appear at the start of the tag nor occur
+ * after singletons. Such two-letter subtags are all uppercase (as in the
+ * tags "en-CA-x-ca" or "sgn-BE-FR") and four-letter subtags are titlecase
+ * (as in the tag "az-Latn-x-latn").
+ */
/*
- In this format, all non-initial two-letter subtags are uppercase, all
- non-initial four-letter subtags are titlecase, and all other subtags
- are lowercase.
- */
-
-
- /*
- * <li>ABNF definition: <a href="http://www.ietf.org/rfc/rfc4234.txt">RFC 4234</a></li>
-
- Language-Tag = langtag
- / privateuse ; private use tag
- / grandfathered ; grandfathered registrations
-
- langtag = (language
- ["-" script]
- ["-" region]
- *("-" variant)
- *("-" extension)
- ["-" privateuse])
-
- language = (2*3ALPHA [ extlang ]) ; shortest ISO 639 code
- / 4ALPHA ; reserved for future use
- / 5*8ALPHA ; registered language subtag
-
- extlang = *3("-" 3ALPHA) ; reserved for future use
-
- script = 4ALPHA ; ISO 15924 code
-
- region = 2ALPHA ; ISO 3166 code
- / 3DIGIT ; UN M.49 code
-
- variant = 5*8alphanum ; registered variants
- / (DIGIT 3alphanum)
-
- extension = singleton 1*("-" (2*8alphanum))
-
- singleton = %x41-57 / %x59-5A / %x61-77 / %x79-7A / DIGIT
- ; "a"-"w" / "y"-"z" / "A"-"W" / "Y"-"Z" / "0"-"9"
- ; Single letters: x/X is reserved for private use
-
- privateuse = ("x"/"X") 1*("-" (1*8alphanum))
-
- grandfathered = 1*3ALPHA 1*2("-" (2*8alphanum))
- ; grandfathered registration
- ; Note: i is the only singleton
- ; that starts a grandfathered tag
-
- alphanum = (ALPHA / DIGIT) ; letters and numbers
-
-
- */
-
- private static final String languageRE_1 = "(?:[a-zA-Z]{2,3}(?:-[a-zA-Z]{3}){0,3})" ; //including extlang
- private static final String languageRE_2 = "[a-zA-Z]{4}" ;
- private static final String languageRE_3 = "[a-zA-Z]{5,8}" ;
- private static final String language = "(?:"+languageRE_1+"|"+languageRE_2+"|"+languageRE_3+")" ;
-
- private static final String script = "[a-zA-Z]{4}" ;
- private static final String region = "[a-zA-Z]{2}|[0-9]{3}" ;
- private static final String variant = "[a-zA-Z0-9]{5,8}" ;
- private static final String extension1 = "(?:[a-zA-Z0-9]-[a-zA-Z0-9]{2,8})" ;
- private static final String extension = extension1+"(?:-"+extension1+")*" ;
-
-// private static final String singleton = null ;
-// private static final String privateuse = null ;
-// private static final String grandfathered = null ;
-
- private static final String langtag = String.format("^(%s)(?:-(%s))?(?:-(%s))?(?:-(%s))?(?:-(%s))?$"
- ,language
- ,script
- ,region
- ,variant
- ,extension
- ) ;
-
+ * ABNF definition: <a href="http://www.ietf.org/rfc/rfc4234.txt">RFC
+ * 4234</a>
+ *
+ * Language-Tag = langtag / privateuse ; private use tag / grandfathered ;
+ * grandfathered registrations
+ *
+ * langtag = (language ["-" script] ["-" region] *("-" variant) *("-" extension)
+ * ["-" privateuse])
+ *
+ * language = (2*3ALPHA [ extlang ]) ; shortest ISO 639 code / 4ALPHA ;
+ * reserved for future use / 5*8ALPHA ; registered language subtag
+ *
+ * extlang = *3("-" 3ALPHA) ; reserved for future use
+ *
+ * script = 4ALPHA ; ISO 15924 code
+ *
+ * region = 2ALPHA ; ISO 3166 code / 3DIGIT ; UN M.49 code
+ *
+ * variant = 5*8alphanum ; registered variants / (DIGIT 3alphanum)
+ *
+ * extension = singleton 1*("-" (2*8alphanum))
+ *
+ * singleton = %x41-57 / %x59-5A / %x61-77 / %x79-7A / DIGIT ; "a"-"w" /
+ * "y"-"z" / "A"-"W" / "Y"-"Z" / "0"-"9" ; Single letters: x/X is reserved
+ * for private use
+ *
+ * privateuse = ("x"/"X") 1*("-" (1*8alphanum))
+ *
+ * grandfathered = 1*3ALPHA 1*2("-" (2*8alphanum)) ; grandfathered
+ * registration ; Note: i is the only singleton ; that starts a
+ * grandfathered tag
+ *
+ * alphanum = (ALPHA / DIGIT) ; letters and numbers
+ */
+
+ private static final String languageRE_1 = "(?:[a-zA-Z]{2,3}(?:-[a-zA-Z]{3}){0,3})" ; // including
+ // extlang
+ private static final String languageRE_2 = "[a-zA-Z]{4}" ;
+ private static final String languageRE_3 = "[a-zA-Z]{5,8}" ;
+ private static final String language = languageRE_1 + "|" + languageRE_2 + "|" + languageRE_3 ;
+
+ private static final String script = "[a-zA-Z]{4}" ;
+ private static final String region = "[a-zA-Z]{2}|[0-9]{3}" ;
+ private static final String variant = "[a-zA-Z0-9]{5,8}" ;
+ private static final String extension1 = "(?:[a-zA-Z0-9]-[a-zA-Z0-9]{2,8})" ;
+ private static final String extension = extension1 + "(?:-" + extension1 + ")*" ;
+
+ // private static final String singleton = null ;
+ // private static final String privateuse = null ;
+ // private static final String grandfathered = null ;
+
+ private static final String langtag = String.format("^(%s)(?:-(%s))?(?:-(%s))?(?:-(%s))?(?:-(%s))?$",
+ language, script, region, variant, extension) ;
+
// Private use forms "x-"
- private static final String privateuseRE = "^[xX](-[a-zA-Z0-9]{1,8})*$" ;
- // In general, this can look like a langtag but there are no registered forms that do so.
+ private static final String privateuseRE = "^[xX](-[a-zA-Z0-9]{1,8})*$" ;
+ // In general, this can look like a langtag but there are no registered
+ // forms that do so.
// This is for the "i-" forms only.
- private static final String grandfatheredRE = "i(?:-[a-zA-Z0-9]{2,8}){1,2}" ;
-
- private static Pattern pattern = Pattern.compile(langtag) ;
- private static Pattern patternPrivateuse = Pattern.compile(privateuseRE) ;
- private static Pattern patternGrandfathered = Pattern.compile(grandfatheredRE) ;
-
- /** Validate - basic syntax check for a language tags: [a-zA-Z]+ ('-' [a-zA-Z0-9]+)* */
- public static boolean check(String languageTag)
- {
+ private static final String grandfatheredRE = "i(?:-[a-zA-Z0-9]{2,8}){1,2}" ;
+
+ private static Pattern pattern = Pattern.compile(langtag) ;
+ private static Pattern patternPrivateuse = Pattern.compile(privateuseRE) ;
+ private static Pattern patternGrandfathered = Pattern.compile(grandfatheredRE) ;
+
+ /**
+ * Validate - basic syntax check for a language tag: [a-zA-Z]+ ('-'
+ * [a-zA-Z0-9]+)*
+ */
+ public static boolean check(String languageTag) {
int len = languageTag.length() ;
- int idx = 0;
+ int idx = 0 ;
boolean first = true ;
- while ( idx < languageTag.length() )
- {
+ while (idx < languageTag.length()) {
int idx2 = checkPart(languageTag, idx, first) ;
first = false ;
if ( idx2 == idx )
@@ -163,26 +159,23 @@ public class LangTag
return true ;
if ( languageTag.charAt(idx) != Chars.CH_DASH )
return false ;
- idx ++ ;
- if ( idx == len)
+ idx++ ;
+ if ( idx == len )
// trailing DASH
return false ;
}
return true ;
}
-
- private static int checkPart(String languageTag, int idx, boolean leader)
- {
- for ( ; idx < languageTag.length() ; idx++)
- {
+
+ private static int checkPart(String languageTag, int idx, boolean leader) {
+ for (; idx < languageTag.length(); idx++) {
int ch = languageTag.charAt(idx) ;
- if ( leader )
- {
- if ( RiotChars.isA2Z(ch) ) continue ;
- }
- else
- {
- if ( RiotChars.isA2ZN(ch) ) continue ;
+ if ( leader ) {
+ if ( RiotChars.isA2Z(ch) )
+ continue ;
+ } else {
+ if ( RiotChars.isA2ZN(ch) )
+ continue ;
}
// Not acceptable.
return idx ;
@@ -191,168 +184,125 @@ public class LangTag
return idx ;
}
- /** Parse a langtag string and return it's parts in canonical case.
- * See constants for the array contents. Parts not present cause a null
- * in the return array.
- * @return Langtag parts, or null if the input string does not poarse as a lang tag.
+ /**
+ * Parse a langtag string and return its parts in canonical case. See
+ * constants for the array contents. Parts not present cause a null in the
+ * return array.
+ *
+ * @return Langtag parts, or null if the input string does not parse as a
+ * lang tag.
*/
- public static String[] parse(String languageTag)
- {
+ public static String[] parse(String languageTag) {
String[] parts = new String[partsLength] ;
+
+ String x = pattern.toString() ;
+
+ Pattern.compile(languageRE_1) ;
+
Matcher m = pattern.matcher(languageTag) ;
- if ( ! m.find() )
- {
+ if ( !m.find() ) {
m = patternPrivateuse.matcher(languageTag) ;
- if ( m.find() )
- {
+ if ( m.find() ) {
// Place in the "extension" part
parts[idxExtension] = m.group(0) ;
return parts ;
}
-
+
m = patternGrandfathered.matcher(languageTag) ;
-
- if ( m.find() )
- {
+
+ if ( m.find() ) {
// Place in the "extension" part
parts[idxExtension] = m.group(0) ;
return parts ;
}
-
+
// Give up.
return null ;
}
-
+
int gc = m.groupCount() ;
- for ( int i = 0 ; i < gc ; i++ )
- parts[i] = m.group(i+1) ;
-
- parts[idxLanguage] = lowercase(parts[idxLanguage]) ;
- parts[idxScript] = strcase(parts[idxScript]) ;
- parts[idxRegion] = strcase(parts[idxRegion]) ;
- parts[idxVariant] = strcase(parts[idxVariant]) ;
- //parts[idxExtension] = strcase(parts[idxExtension]) ; // Leave extensions alone.
+ for (int i = 0; i < gc; i++)
+ parts[i] = m.group(i + 1) ;
+
+ parts[idxLanguage] = lowercase(parts[idxLanguage]) ;
+ parts[idxScript] = strcase(parts[idxScript]) ;
+ parts[idxRegion] = strcase(parts[idxRegion]) ;
+ parts[idxVariant] = strcase(parts[idxVariant]) ;
+ // parts[idxExtension] = strcase(parts[idxExtension]) ; // Leave
+ // extensions alone.
return parts ;
}
/** Canonicalize with the rules of RFC 4646 */
- public static String canonical(String str)
- {
+ public static String canonical(String str) {
if ( str == null )
return null ;
String[] parts = parse(str) ;
String x = canonical(parts) ;
- if ( x == null )
+ if ( x == null ) {
+ // Could try to apply the case-setting rules
+ // even though it's not a conforming langtag.
return str ;
+ }
return x ;
}
-
- /** Canonicalize with the rules of RFC 4646
- "In this format, all non-initial two-letter subtags are uppercase, all
- non-initial four-letter subtags are titlecase, and all other subtags
- are lowercase."
- In addition, leave extensions unchanged.
+
+ /**
+ * Canonicalize with the rules of RFC 4646 "In this format, all non-initial
+ * two-letter subtags are uppercase, all non-initial four-letter subtags are
+ * titlecase, and all other subtags are lowercase." In addition, leave
+ * extensions unchanged.
*/
- public static String canonical(String[] parts)
- {
+ public static String canonical(String[] parts) {
+ // We canonicalised parts on parsing.
+ // RFC 5646 is slightly different.
if ( parts == null )
return null ;
-
- if ( parts[0] == null )
- {
+
+ if ( parts[0] == null ) {
// Grandfathered
return parts[idxExtension] ;
}
StringBuilder sb = new StringBuilder() ;
sb.append(parts[0]) ;
- for ( int i = 1 ; i < parts.length ; i++ )
- {
- if ( parts[i] != null )
- {
+ for (int i = 1; i < parts.length; i++) {
+ if ( parts[i] != null ) {
sb.append("-") ;
sb.append(parts[i]) ;
}
}
- return sb.toString();
+ return sb.toString() ;
}
-
- private static String strcase(String string)
- {
- if ( string == null ) return null ;
- if ( string.length() == 2 ) return uppercase(string) ;
- if ( string.length() == 4 ) return titlecase(string) ;
+
+ private static String strcase(String string) {
+ if ( string == null )
+ return null ;
+ if ( string.length() == 2 )
+ return uppercase(string) ;
+ if ( string.length() == 4 )
+ return titlecase(string) ;
return lowercase(string) ;
}
- private static String lowercase(String string)
- {
- if ( string == null ) return null ;
+ private static String lowercase(String string) {
+ if ( string == null )
+ return null ;
return string.toLowerCase(Locale.ROOT) ;
}
- private static String uppercase(String string)
- {
- if ( string == null ) return null ;
+ private static String uppercase(String string) {
+ if ( string == null )
+ return null ;
return string.toUpperCase(Locale.ROOT) ;
}
- private static String titlecase(String string)
- {
- if ( string == null ) return null ;
+ private static String titlecase(String string) {
+ if ( string == null )
+ return null ;
char ch1 = string.charAt(0) ;
ch1 = Character.toUpperCase(ch1) ;
string = lowercase(string.substring(1)) ;
- return ch1+string ;
- }
-
- // ----------
-
- public static void main(String ... args) //throws IOException
- {
- // Test data.
- String[] tags = {
- "en", "en-uk", "es-419", "zh-Hant",
- "sr-Latn-CS" , "sl-nedis", "sl-IT-nedis" , "sl-Latn-IT-nedis",
- "de-CH-x-Phonebk",
- "zh-cn-a-myExt-x-private",
- "x-foo",
- "x-kx-kx-kx",
- "i-whatever",
- "12345"} ;
-
- if ( args.length == 0 )
- args = tags ;
-
- for ( String str : args )
- {
- String[] parts = LangTag.parse(str) ;
- System.out.print("\""+str+"\"") ;
- boolean first =true ;
-
- if ( parts == null )
- {
- System.out.print(" ==> Illegal") ;
- }
- else
- {
- String canonical = canonical(parts) ;
- System.out.print(" ==> \""+canonical+"\"") ;
-
- System.out.print(" (") ;
- for ( String s : parts )
- {
- if ( ! first )
- System.out.print(", ") ;
- first = false ;
- if ( s == null )
- System.out.print("null") ;
- else
- System.out.print("\""+s+"\"") ;
- }
- System.out.print(")") ;
- }
- System.out.println() ;
- }
+ return ch1 + string ;
}
}
Modified: jena/trunk/jena-arq/src/test/java/org/apache/jena/riot/web/TestLangTag.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-arq/src/test/java/org/apache/jena/riot/web/TestLangTag.java?rev=1560173&r1=1560172&r2=1560173&view=diff
==============================================================================
--- jena/trunk/jena-arq/src/test/java/org/apache/jena/riot/web/TestLangTag.java (original)
+++ jena/trunk/jena-arq/src/test/java/org/apache/jena/riot/web/TestLangTag.java Tue Jan 21 20:33:06 2014
@@ -62,16 +62,16 @@ public class TestLangTag extends BaseTes
private static void parseGood(String input, String ex_output, String... ex_parts )
{
- String[] parts = LangTag.parse(input) ;
- assertArrayEquals(ex_parts, parts) ;
-
- String output = LangTag.canonical(input) ;
- assertEquals(ex_output, output) ;
-
- assertTrue(LangTag.check(input)) ;
+ String[] parts = LangTag.parse(input) ;
+ assertArrayEquals(ex_parts, parts) ;
+
+ String output = LangTag.canonical(input) ;
+ assertEquals(ex_output, output) ;
+
+ assertTrue(LangTag.check(input)) ;
}
-
+
private static void parseBad(String input)
{
String[] parts = LangTag.parse(input) ;
@@ -80,5 +80,41 @@ public class TestLangTag extends BaseTes
assertEquals(input, output) ;
assertFalse(LangTag.check(input)) ;
}
+
+ private void testCanonical(String input, String ex_output) {
+ String output = LangTag.canonical(input) ;
+ assertEquals(ex_output, output) ;
+ }
+ // "x" extensions and irregular forms are left alone, including "sgn-be-fr"
+
+ // Mentioned in BCP 47 tests
+// @Test public void parseCanonical_01() { testCanonical("en-ca-x-ca","en-CA-x-ca"); } // "x"
+// @Test public void parseCanonical_02() { testCanonical("EN-ca-X-Ca","en-CA-x-ca"); }
+// @Test public void parseCanonical_03() { testCanonical("En-Ca-X-Ca","en-CA-x-ca"); }
+// @Test public void parseCanonical_04() { testCanonical("SGN-BE-FR","sgn-BE-FR"); } // Irregular
+// @Test public void parseCanonical_05() { testCanonical("sgn-be-fr","sgn-BE-FR"); } // Irregular
+// @Test public void parseCanonical_06() { testCanonical("AZ-latn-x-LATN","az-Latn-x-latn"); }
+// @Test public void parseCanonical_07() { testCanonical("Az-latn-X-Latn","az-Latn-x-latn"); }
+
+ @Test public void parseCanonical_10() { testCanonical("zh-hant", "zh-Hant"); }
+ @Test public void parseCanonical_11() { testCanonical("zh-latn-wadegile", "zh-Latn-wadegile"); }
+ @Test public void parseCanonical_12() { testCanonical("zh-latn-pinyin", "zh-Latn-pinyin"); }
+ @Test public void parseCanonical_13() { testCanonical("en-us", "en-US"); }
+ @Test public void parseCanonical_14() { testCanonical("EN-Gb", "en-GB"); }
+ @Test public void parseCanonical_15() { testCanonical("qqq-002", "qqq-002"); }
+ @Test public void parseCanonical_16() { testCanonical("ja-latn", "ja-Latn"); }
+ @Test public void parseCanonical_17() { testCanonical("x-local", "x-local"); }
+ @Test public void parseCanonical_18() { testCanonical("he-latn", "he-Latn"); }
+ @Test public void parseCanonical_19() { testCanonical("und", "und"); }
+ @Test public void parseCanonical_20() { testCanonical("nn", "nn"); }
+ @Test public void parseCanonical_21() { testCanonical("ko-latn", "ko-Latn"); }
+ @Test public void parseCanonical_22() { testCanonical("ar-latn", "ar-Latn"); }
+ @Test public void parseCanonical_23() { testCanonical("la-x-liturgic", "la-x-liturgic"); }
+ @Test public void parseCanonical_24() { testCanonical("fa-x-middle", "fa-x-middle"); }
+ @Test public void parseCanonical_25() { testCanonical("qqq-142", "qqq-142"); }
+ @Test public void parseCanonical_26() { testCanonical("bnt", "bnt"); }
+ @Test public void parseCanonical_27() { testCanonical("grc-x-liturgic", "grc-x-liturgic"); }
+ @Test public void parseCanonical_28() { testCanonical("egy-Latn", "egy-Latn"); }
+ @Test public void parseCanonical_29() { testCanonical("la-x-medieval", "la-x-medieval"); }
}