You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ni...@apache.org on 2011/09/23 13:54:12 UTC

svn commit: r1174680 - in /tika/trunk/tika-parsers/src: main/java/org/apache/tika/parser/txt/CharsetDetector.java main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java test/java/org/apache/tika/parser/txt/TXTParserTest.java

Author: nick
Date: Fri Sep 23 11:54:11 2011
New Revision: 1174680

URL: http://svn.apache.org/viewvc?rev=1174680&view=rev
Log:
TIKA-720 Add Charset Detector for the IBM500 (EBCDIC) charset

Modified:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java?rev=1174680&r1=1174679&r2=1174680&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java Fri Sep 23 11:54:11 2011
@@ -528,6 +528,13 @@ public class CharsetDetector {
         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
 
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_en());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_de());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_es());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_fr());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_it());
+        recognizers.add(new CharsetRecog_sbcs.CharsetRecog_EBCDIC_500_nl());
+        
         recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM866_ru());
 
         // Create an array of all charset names, as a side effect.

Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java?rev=1174680&r1=1174679&r2=1174680&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java Fri Sep 23 11:54:11 2011
@@ -1334,6 +1334,123 @@ abstract class CharsetRecog_sbcs extends
             matchFinish(det);
             return result;
         }
+    }
+    
+    static abstract class CharsetRecog_EBCDIC_500 extends CharsetRecog_sbcs
+    {
+        // Maps each EBCDIC 500 byte to a space (not of interest) or to a lower-case ISO_8859_1
+        //  number/letter/accented-letter byte; upper-case letters fold onto their lower-case form
+        //  (0x80, O-slash, gives 0xf8 just like 0x70) so the ISO_8859_1 ngrams can be re-used.
+        protected static byte[] byteMap = {
+/* 0x00-0x07 */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0x08-0x0f */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0x10-0x17 */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0x18-0x1f */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0x20-0x27 */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0x28-0x2f */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0x30-0x37 */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0x38-0x3f */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0x40-0x47 */ (byte)0x20, (byte)0x20, (byte)0xe2, (byte)0xe4, (byte)0xe0, (byte)0xe1, (byte)0xe3, (byte)0xe5,
+/* 0x48-0x4f */ (byte)0xe7, (byte)0xf1, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0x50-0x57 */ (byte)0x20, (byte)0xe9, (byte)0xea, (byte)0xeb, (byte)0xe8, (byte)0xed, (byte)0xee, (byte)0xef,
+/* 0x58-0x5f */ (byte)0xec, (byte)0xdf, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0x60-0x67 */ (byte)0x20, (byte)0x20, (byte)0xe2, (byte)0xe4, (byte)0xe0, (byte)0xe1, (byte)0xe3, (byte)0xe5,
+/* 0x68-0x6f */ (byte)0xe7, (byte)0xf1, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0x70-0x77 */ (byte)0xf8, (byte)0xe9, (byte)0xea, (byte)0xeb, (byte)0xe8, (byte)0xed, (byte)0xee, (byte)0xef,
+/* 0x78-0x7f */ (byte)0xec, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0x80-0x87 */ (byte)0xf8, (byte)'a',  (byte)'b',  (byte)'c',  (byte)'d',  (byte)'e',  (byte)'f',  (byte)'g',
+/* 0x88-0x8f */ (byte)'h',  (byte)'i',  (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0x90-0x97 */ (byte)0x20, (byte)'j',  (byte)'k',  (byte)'l',  (byte)'m',  (byte)'n',  (byte)'o',  (byte)'p', 
+/* 0x98-0x9f */ (byte)'q',  (byte)'r',  (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0xa0-0xa7 */ (byte)0x20, (byte)0x20, (byte)'s',  (byte)'t',  (byte)'u',  (byte)'v',  (byte)'w',  (byte)'x',
+/* 0xa8-0xaf */ (byte)'y',  (byte)'z',  (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0xb0-0xb7 */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0xb8-0xbf */ (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20, (byte)0x20,
+/* 0xc0-0xc7 */ (byte)0x20, (byte)'a',  (byte)'b',  (byte)'c',  (byte)'d',  (byte)'e',  (byte)'f',  (byte)'g',
+/* 0xc8-0xcf */ (byte)'h',  (byte)'i',  (byte)0x20, (byte)0xf4, (byte)0xf6, (byte)0xf2, (byte)0xf3, (byte)0xf5,
+/* 0xd0-0xd7 */ (byte)0x20, (byte)'j',  (byte)'k',  (byte)'l',  (byte)'m',  (byte)'n',  (byte)'o',  (byte)'p', 
+/* 0xd8-0xdf */ (byte)'q',  (byte)'r',  (byte)0x20, (byte)0xfb, (byte)0xfc, (byte)0xf9, (byte)0xfa, (byte)0xff,
+/* 0xe0-0xe7 */ (byte)0x20, (byte)0x20, (byte)'s',  (byte)'t',  (byte)'u',  (byte)'v',  (byte)'w',  (byte)'x',
+/* 0xe8-0xef */ (byte)'y',  (byte)'z',  (byte)0x20, (byte)0xf4, (byte)0xf6, (byte)0xf2, (byte)0xf3, (byte)0xf5,
+/* 0xf0-0xf7 */ (byte)'0',  (byte)'1',  (byte)'2',  (byte)'3',  (byte)'4',  (byte)'5',  (byte)'6',  (byte)'7', 
+/* 0xf8-0xff */ (byte)'8',  (byte)'9',  (byte)0x20, (byte)0xfb, (byte)0xfc, (byte)0xf9, (byte)0xfa, (byte)0x20,
+        };
         
+        public String getName()
+        {
+            return "IBM500"; // charset name shared by every EBCDIC 500 language recognizer
+        }
+    }
+    
+    static class CharsetRecog_EBCDIC_500_en extends CharsetRecog_EBCDIC_500
+    {
+        // Delegates to the inherited match with the CharsetRecog_8859_1_en ngrams and byteMap.
+        public int match(CharsetDetector det)
+        {
+            return match(det, CharsetRecog_8859_1_en.ngrams, byteMap);
+        }
+
+        // Language code reported for this recognizer.
+        public String getLanguage() { return "en"; }
+    }
+    
+    static class CharsetRecog_EBCDIC_500_de extends CharsetRecog_EBCDIC_500
+    {
+        // Delegates to the inherited match with the CharsetRecog_8859_1_de ngrams and byteMap.
+        public int match(CharsetDetector det)
+        {
+            return match(det, CharsetRecog_8859_1_de.ngrams, byteMap);
+        }
+
+        // Language code reported for this recognizer.
+        public String getLanguage() { return "de"; }
+    }
+    
+    static class CharsetRecog_EBCDIC_500_fr extends CharsetRecog_EBCDIC_500
+    {
+        // Delegates to the inherited match with the CharsetRecog_8859_1_fr ngrams and byteMap.
+        public int match(CharsetDetector det)
+        {
+            return match(det, CharsetRecog_8859_1_fr.ngrams, byteMap);
+        }
+
+        // Language code reported for this recognizer.
+        public String getLanguage() { return "fr"; }
+    }
+    
+    static class CharsetRecog_EBCDIC_500_es extends CharsetRecog_EBCDIC_500
+    {
+        // Delegates to the inherited match with the CharsetRecog_8859_1_es ngrams and byteMap.
+        public int match(CharsetDetector det)
+        {
+            return match(det, CharsetRecog_8859_1_es.ngrams, byteMap);
+        }
+
+        // Language code reported for this recognizer.
+        public String getLanguage() { return "es"; }
+    }
+    
+    static class CharsetRecog_EBCDIC_500_it extends CharsetRecog_EBCDIC_500
+    {
+        // Delegates to the inherited match with the CharsetRecog_8859_1_it ngrams and byteMap.
+        public int match(CharsetDetector det)
+        {
+            return match(det, CharsetRecog_8859_1_it.ngrams, byteMap);
+        }
+
+        // Language code reported for this recognizer.
+        public String getLanguage() { return "it"; }
+    }
+    
+    static class CharsetRecog_EBCDIC_500_nl extends CharsetRecog_EBCDIC_500
+    {
+        // Delegates to the inherited match with the CharsetRecog_8859_1_nl ngrams and byteMap.
+        public int match(CharsetDetector det)
+        {
+            return match(det, CharsetRecog_8859_1_nl.ngrams, byteMap);
+        }
+
+        // Language code reported for this recognizer.
+        public String getLanguage() { return "nl"; }
     }
 }

Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java?rev=1174680&r1=1174679&r2=1174680&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java Fri Sep 23 11:54:11 2011
@@ -191,4 +191,17 @@ public class TXTParserTest extends TestC
         assertEquals("IBM866", metadata.get(Metadata.CONTENT_ENCODING));
     }
 
+    // Parses the english.cp500.txt test document and checks that the resulting
+    // metadata reports it as text/plain with the IBM500 content encoding.
+    public void testEBCDIC_CP500() throws Exception {
+        Metadata detected = new Metadata();
+        WriteOutContentHandler sink = new WriteOutContentHandler(new StringWriter());
+        parser.parse(
+                TXTParserTest.class.getResourceAsStream("/test-documents/english.cp500.txt"),
+                sink, detected, new ParseContext());
+
+        assertEquals("text/plain", detected.get(Metadata.CONTENT_TYPE));
+        assertEquals("IBM500", detected.get(Metadata.CONTENT_ENCODING));
+    }
+
 }