You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@abdera.apache.org by jm...@apache.org on 2007/10/18 23:30:21 UTC
svn commit: r586147 -
/incubator/abdera/java/trunk/extensions/main/src/main/java/org/apache/abdera/ext/bidi/BidiHelper.java
Author: jmsnell
Date: Thu Oct 18 14:30:21 2007
New Revision: 586147
URL: http://svn.apache.org/viewvc?rev=586147&view=rev
Log:
Tests for the Bidi detection algorithm improvements
Modified:
incubator/abdera/java/trunk/extensions/main/src/main/java/org/apache/abdera/ext/bidi/BidiHelper.java
Modified: incubator/abdera/java/trunk/extensions/main/src/main/java/org/apache/abdera/ext/bidi/BidiHelper.java
URL: http://svn.apache.org/viewvc/incubator/abdera/java/trunk/extensions/main/src/main/java/org/apache/abdera/ext/bidi/BidiHelper.java?rev=586147&r1=586146&r2=586147&view=diff
==============================================================================
--- incubator/abdera/java/trunk/extensions/main/src/main/java/org/apache/abdera/ext/bidi/BidiHelper.java (original)
+++ incubator/abdera/java/trunk/extensions/main/src/main/java/org/apache/abdera/ext/bidi/BidiHelper.java Thu Oct 18 14:30:21 2007
@@ -19,14 +19,16 @@
import java.text.AttributedString;
import java.text.Bidi;
+import java.util.Arrays;
import java.util.Locale;
import javax.xml.namespace.QName;
-import org.apache.abdera.model.Base;
-import org.apache.abdera.model.Element;
import org.apache.abdera.i18n.io.CharUtils;
import org.apache.abdera.i18n.lang.Lang;
+import org.apache.abdera.model.Base;
+import org.apache.abdera.model.Document;
+import org.apache.abdera.model.Element;
/**
* <p>This is (hopefully) temporary. Ideally, this would be wrapped into the
@@ -181,6 +183,32 @@
return guessDirectionFromLanguage(element, false);
}
+  // Primary language subtags that are written right-to-left.
+  // MUST stay in ascending String order: lookups use Arrays.binarySearch,
+  // whose result is undefined on an unsorted array (the previous ordering
+  // caused entries such as "he" to be missed).
+  private static final String[] RTL_LANGS = {
+    "ar","dv","fa","he","ps","syr","ur","yi"};
+  // Script subtags that are written right-to-left.
+  // MUST stay in ascending String order (searched with Arrays.binarySearch).
+  private static final String[] RTL_SCRIPTS = {
+    "arab","avst","hebr","hung","lydi","mand",
+    "mani","mero","mong","nkoo","orkh","phlv",
+    "phnx","samr","syrc","syre","syrj","syrn",
+    "tfng","thaa"
+  };
+  // charset encodings that one may typically expect to be RTL.
+  // Entries are lowercase and spelled with '-' in place of '_', because the
+  // lookup replaces '_' with '-' in the candidate charset name before
+  // comparing; an entry containing '_' could never match.
+  private static final String[] RTL_ENCODINGS = {
+    "iso-8859-6", "iso-8859-6-bidi",
+    "iso-8859-6-i", "iso-ir-127",
+    "ecma-114", "asmo-708", "arabic",
+    "csisolatinarabic", "windows-1256",
+    "ibm-864", "macarabic", "macfarsi",
+    "iso-8859-8-i", "iso-8859-8-bidi",
+    "windows-1255", "iso-8859-8", "ibm-862",
+    "machebrew", "asmo-449", "iso-9036",
+    "arabic7", "iso-ir-89", "csiso89asmo449",
+    "iso-unicode-ibm-1264", "csunicodeibm1264",
+    "iso-8859-8:1988", "iso-ir-138", "hebrew",
+    "csisolatinhebrew", "iso-unicode-ibm-1265",
+    "csunicodeibm1265", "cp862", "862",
+    "cspc862latinhebrew"
+  };
+
/**
* Attempt to guess the base direction using the in-scope language.
* Implements the method used by Internet Explorer 7's feed view
@@ -203,18 +231,44 @@
Locale l = Locale.getDefault();
lang = new Lang(l.getLanguage());
}
+ if (lang.getSubtagCount() > 0) {
+ String script = lang.getSubtag(0);
+ if (Arrays.binarySearch(RTL_SCRIPTS, script.toLowerCase()) > -1)
+ return Direction.RTL;
+ }
String primary = lang.getPrimary();
- return (primary.equalsIgnoreCase("ar") ||
- primary.equalsIgnoreCase("fa") ||
- primary.equalsIgnoreCase("ur") ||
- primary.equalsIgnoreCase("ps") ||
- primary.equalsIgnoreCase("syr") ||
- primary.equalsIgnoreCase("dv") ||
- primary.equalsIgnoreCase("he") ||
- primary.equalsIgnoreCase("yi")) ? Direction.RTL : Direction.LTR;
+ if (Arrays.binarySearch(RTL_LANGS, primary.toLowerCase()) > -1)
+ return Direction.RTL;
+ return Direction.UNSPECIFIED;
}
/**
+  * Guess the base direction of an element from the charset encoding.
+  * Convenience overload that delegates to
+  * {@code guessDirectionFromEncoding(element, false)}; treat the result as
+  * a last-resort hint.
+  */
+  public static <T extends Element>Direction guessDirectionFromEncoding(T element) {
+    final boolean ignoredir = false;
+    return guessDirectionFromEncoding(element, ignoredir);
+  }
+
+  /**
+   * Attempt to guess the base direction using the charset encoding reported
+   * by the element's document. This is a last-resort approach: it only
+   * reports RTL when the charset name is listed in RTL_ENCODINGS.
+   *
+   * @param element the element to inspect
+   * @param ignoredir when false and hasDirection(element) is true, the value
+   *   of getDirection(element) is returned without consulting the charset
+   * @return Direction.RTL if the charset name is listed in RTL_ENCODINGS;
+   *   Direction.UNSPECIFIED otherwise, including when the element has no
+   *   document or the document has no charset
+   */
+  @SuppressWarnings("unchecked")
+  public static <T extends Element>Direction guessDirectionFromEncoding(T element, boolean ignoredir) {
+    if (!ignoredir && hasDirection(element)) return getDirection(element);
+    Document doc = element.getDocument();
+    if (doc == null) return Direction.UNSPECIFIED;
+    String charset = doc.getCharset();
+    if (charset == null) return Direction.UNSPECIFIED;
+    // Table entries are spelled with '-', so treat '_' in the input as '-'.
+    String normalized = charset.replace('_', '-');
+    // Read-only scan of the table. Sorting the shared static array on every
+    // call mutated it (a race between concurrent callers) and redid the same
+    // work each time. equalsIgnoreCase is also independent of the default
+    // locale, unlike toLowerCase() (e.g. Turkish maps 'I' to dotless 'i').
+    for (String encoding : RTL_ENCODINGS) {
+      // The raw name is checked too, so an entry spelled with '_' still matches.
+      if (normalized.equalsIgnoreCase(encoding) || charset.equalsIgnoreCase(encoding))
+        return Direction.RTL;
+    }
+    return Direction.UNSPECIFIED;
+  }
+
+ /**
* Attempt to guess the base direction of an element using an analysis of
* the directional properties of the characters used. This is a brute-force
* style approach that can achieve fairly reasonable results when the element
@@ -245,7 +299,9 @@
Direction dir = Direction.UNSPECIFIED;
if (!ignoredir && hasDirection(element)) return getDirection(element);
String text = element.getText();
- if (text != null) {
+ if (text != null && text.length() > 0) {
+ if (text.charAt(0) == 0x200F) return Direction.RTL; // if using the unicode right-to-left mark
+ if (text.charAt(0) == 0x200E) return Direction.LTR; // if using the unicode left-to-right mark
int c = 0;
for (int n = 0; n < text.length(); n++) {
char ch = text.charAt(n);