Nitin Verma’s Blog

Posts Tagged ‘i18n

Just open this rfc : RFC 3629

Open this table:

   Char. number range  |        UTF-8 octet sequence
      (hexadecimal)    |              (binary)
   --------------------+---------------------------------------------
   0000 0000-0000 007F | 0xxxxxxx
   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

Select a language with chars < U+10FFFF {It is possible to have chars above 10FFFF also if needed with time}– Unicode charts

Unicode character search

I just selected U+0950 ( U+0950 )

Now looking at the table it is in 0800-FFFF range

   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 

So just write ‘u0950’ in binary –>

  1. 0000 1001 0101 0000
  2. 0000 100101 010000 << make two group of 6 and one of 4
  3. 1110 0000 1010 0101 1001 0000 << now append group of 4 with 1110 and others with 10
  4. E0 A5 90 << in Hex
$ awk ' BEGIN { print "\xE0\xA5\x90" } '
เฅ

Now let us take
U+30F8 ใƒธ KATAKANA LETTER VI ( U+30F8)
Range :

   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx 
  1. 0011 0000 1111 1000
  2. 0011 000011 111000
  3. 1110 0011 1000 0011 1011 1000
  4. E3 83 B8
$ awk ' BEGIN { print "\xE3\x83\xB8" } '
ใƒธ

next
U+10147 ๐…‡ GREEK ACROPHONIC ATTIC FIFTY THOUSAND (U+10147)
Range:

    0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 
  1. 0001 0000 0001 0100 0111
  2. 00 010000 000101 000111
  3. 1111 0000 1001 0000 1000 0101 1000 0111
  4. F0 90 85 87
$ awk ' BEGIN { print "\xF0\x90\x85\x87" } '
๐…‡

You may need to install ‘ttf-ancient-fonts’ to look at GREEK ACROPHONIC.

After doing this enjoy the following article.
No Excuses!

Advertisements

Had done this long time ago for a legacy database, where UTF-8 char sequence was also overflowing ๐Ÿ™‚ . May help someone.

RFC 3629

PJAM

$ java pjam.encoding.UTF7
I: เฅใ„‹ใ„งใ„Šใ„งใ„‹เฅ
E: rJPxLHKxLIGxLHJxLIGxLHKrJP
D: เฅใ„‹ใ„งใ„Šใ„งใ„‹เฅ
/*
 * UTF7.java
 *
 * Created on October 7, 2004, 4:53 PM
 */

package pjam.encoding;

import java.io.ByteArrayOutputStream;
import java.io.CharArrayWriter;
import java.io.ByteArrayInputStream;

/**
 * See RFC 3629,
 * this is a very similar transformation format of ISO 10646 in 7-bit.
 *
 * Implementation is for BMP support.
 * 

 * Char. number| UTF-7 heptade sequence
 * range       |
 * -------------------------------------------
 * 0000 - 003F | 0xxxxxx
 * 0040 - 01FF | 110xxxx 10xxxxx
 * 0200 - 1FFF | 1110xxx 10xxxxx 10xxxxx
 * 2000 - FFFF | 111100x 10xxxxx 10xxxxx 10xxxxx
 * 

 *
 * Can also support UTF-16 range (U+0000 to U+10FFFF), but would need
 * UTF-16 transformation (I think)
 * for char > U+FFFF as char size in java is 2 bytes only.
 * 

 * Char. number range | UTF-7 heptade sequence
 * -------------------------------------------
 * 000000 - 00003F    | 0xxxxxx
 * 000040 - 0001FF    | 110xxxx 10xxxxx
 * 000200 - 001FFF    | 1110xxx 10xxxxx 10xxxxx
 * 002000 - 01FFFF    | 11110xx 10xxxxx 10xxxxx 10xxxxx
 * 020000 - 10FFFF    | 1111100 10xxxxx 10xxxxx 10xxxxx 10xxxxx
 * 

 *
 * 

 *  final String string = "Nitin";
 *  final String utf7String = UTF7.encodeString(string);
 *  final String decodedString = UTF7.decodeString(utf7String)
 *  if ( !decodedString.equals(string)) {
 *      //Should never happen
 *  }
 * 

 *
 * This code is only to support 7-bit systems or charsets or streams, should not be used
 * to encode any Internet stream.
 *
 * @author  Nitin Verma
 * @version 1.0
 *
 */
public class UTF7 {

    public static void main ( final String args [] ) {
        // Nitin in Chinese Bopomofo between OM!
        final String string = "\u0950\u310B\u3127\u310A\u3127\u310B\u0950";
        System.out.println("I: " + string);
        final String utf7String = UTF7.encodeString(string);
        System.out.println("E: " + utf7String);
        final String decodedString = UTF7.decodeString(utf7String);
        System.out.println("D: " + decodedString);
        if ( !decodedString.equals(string)) {
             System.out.println( " Should never happen " );
        }
    }

    private UTF7() {
    }

    /** Gives a string that envelops UTF-7 bytes
     * @param bmpString A string only having chars in 'Basic Multilingual Plane'
     * @return UTF-7 String
     */
    public static String encodeString(final String bmpString) {
        char [] chars = new char[bmpString.length()];
        bmpString.getChars(0, bmpString.length(), chars, 0);
        return new String(encodeChars(chars));
    }

    /** Encodes BMP chars to UTF-7 byte sequence
     * @param bmpChars chars in 'Basic Multilingual Plane'
     * @return UTF-7 bytes
     */
    public static byte [] encodeChars(final char [] bmpChars) {
        ByteArrayOutputStream baos = new ByteArrayOutputStream(bmpChars.length);
        try {
            for ( int i = 0; i < bmpChars.length; i++ ) {
                baos.write(encodeChar(bmpChars&#91;i&#93;));
            }
        }
        catch (java.io.IOException ioe) {
            // will never happen
        }
        return baos.toByteArray();
    }

    /** Encodes BMP char to UTF-7 byte sequence
     * @param bmpChar char in 'Basic Multilingual Plane'
     * @return UTF-7 bytes
     */
    public static byte &#91;&#93; encodeChar(final char bmpChar) {
        if ( bmpChar >= 0x0 && bmpChar < = 0x3F ) {
             int b1 = (bmpChar & 0x3F);
             return new byte &#91;&#93; {(byte)b1};
        }
        else if ( bmpChar > 0x3F && bmpChar < = 0x1FF ) {
            int b1 = (bmpChar & 0x1F);
            b1 = 0x40 + b1;
            int b2=(bmpChar & 0x1FF) >> 5;
            b2 = 0x60 + b2;




            return new byte [] {(byte)b2, (byte)b1};
        }
        else if ( bmpChar > 0x1FF && bmpChar < = 0x1FFF ) {
            int b1 = (bmpChar & 0x1F);
            b1 = 0x40 + b1;
            int b2=(bmpChar & 0x3FF) >> 5;
            b2 = 0x40 + b2;
            int b3 = (bmpChar & 0x1c00) >> 10;
            b3 = 0x70 + b3;
            return new byte [] {(byte)b3, (byte)b2, (byte)b1};
        }
        else if ( bmpChar > 0x1FFF && bmpChar < = 0xFFFF ) {
            int b1 = (bmpChar & 0x1F);
            b1 = 0x40 + b1;
            int b2=(bmpChar & 0x3FF) >> 5;
            b2 = 0x40 + b2;
            int b3 = (bmpChar & 0x7c00) >> 10;
            b3 = 0x40 + b3;
            int b4 = (bmpChar & 0x8000) >> 15;
            b4 = 0x78 + b4;
            return new byte [] {(byte)b4, (byte)b3, (byte)b2, (byte)b1};
        }
        else {
            throw new RuntimeException("Only BMP charset support (U+0000 to U+FFFF), non-bmp char was " + bmpChar);
        }
    }

    /** Decodes UTF-7 encoded string
     * @param utfString A valid UTF-7 encoded string
     * @return Decoded string
     */
    public static String decodeString(final String utfString) {
        return new String(decodeUTFBytes(utfString.getBytes()));
    }

    /** Decodes UTF-7 byte sequence to BMP chars
     * @param utfBytes A valid UTF-7 encoded byte sequence
     * @return Decoded BMP chars
     */
    public static char [] decodeUTFBytes(final byte [] utfBytes) {
        ByteArrayInputStream bais = new ByteArrayInputStream(utfBytes);
        CharArrayWriter caw = new CharArrayWriter(utfBytes.length/4);
        int readByte = 0;
        while( (readByte = bais.read()) != -1 ) {

            if (readByte >= 0x0 && readByte < = 0x3F ) {
                caw.write((int)readByte);
                continue;
            }

            int sequenceByte = readByte;
            // Does it start with xxx0?
            if ( (sequenceByte & 0x10) == 0x0 ) {
                // Should start with x110
                if( (sequenceByte & 0x70) != 0x60 ) {
                    throw new RuntimeException("not UTF-7 byte sequence, bad sequence byte " + Integer.toBinaryString(sequenceByte));
                }
                int b1 = bais.read();
                checkTailByte(b1, new int &#91;&#93; {sequenceByte});
                caw.write(decodeUTF(new byte&#91;&#93; {(byte)readByte, (byte)b1}));
            }
            // Is it xxxx0xxx?
            else if ( (sequenceByte & 0x08) == 0x0 ) {
                // Should start with x1110
                if( (sequenceByte & 0x78) != 0x70 ) {
                    throw new RuntimeException("not UTF-7 byte sequence, bad sequence byte " + Integer.toBinaryString(sequenceByte));
                }
                int b1 = bais.read();
                checkTailByte(b1, new int &#91;&#93; {sequenceByte});
                int b2 = bais.read();
                checkTailByte(b2, new int &#91;&#93; {sequenceByte, b1});
                caw.write(decodeUTF(new byte&#91;&#93; {(byte)readByte, (byte)b1, (byte)b2}));
            }
            // Is it xxxxx00x?
            else if ( (sequenceByte & 0x06) == 0x0 ) {
                // Should start with x11110
                if( (sequenceByte & 0x7C) != 0x78 ) {
                    throw new RuntimeException("not UTF-7 byte sequence, bad sequence byte " + Integer.toBinaryString(sequenceByte));
                }
                int b1 = bais.read();
                checkTailByte(b1, new int &#91;&#93; {sequenceByte});
                int b2 = bais.read();
                checkTailByte(b2, new int &#91;&#93; {sequenceByte, b1});
                int b3 = bais.read();
                checkTailByte(b3, new int &#91;&#93; {sequenceByte, b1, b2});
                caw.write(decodeUTF(new byte&#91;&#93; {(byte)readByte, (byte)b1, (byte)b2, (byte)b3}));
            }
            else {
                throw new RuntimeException("not UTF-7 byte sequence");
            }
        }
        return caw.toCharArray();
    }

    private static void checkTailByte(int b, int &#91;&#93; priorBytes) {
        // Should start with x10x
        if( (b & 0x60) != 0x40 ) {
            throw new RuntimeException("not UTF-7 byte sequence, bad bytes sequence " + toBinaryString(priorBytes) + Integer.toBinaryString(b & 0x00ff));
        }
    }

    private static String toBinaryString(int &#91;&#93; bytes) {
        StringBuffer sb = new StringBuffer();
        for ( int i = 0; i < bytes.length; i++ ) {
            sb.append(Integer.toBinaryString(bytes&#91;i&#93; & 0x00ff));
            sb.append(" ");
        }
        return sb.toString();
    }

    /** Decodes UTF-7 byte sequence to BMP char.
     * @param utf A valid UTF-7 encoded byte sequence for a single BMP char
     * @return Decoded BMP char
     */
    public static char decodeUTF(final byte &#91;&#93; utf) {
        if ( utf.length == 1 ) {
            return (char) utf&#91;0&#93;;
        }
        else if ( utf.length == 2 ) {
            int o1 = (utf&#91;1&#93; & 0x001f);
            o1 = o1 << 3;
            int o2 = utf&#91;0&#93; & 0x000f;
            o2 = o2 << 8;
            int c1 = o2 + o1;
            c1 = c1 >> 3;
            return (char)c1;
        }
        else if ( utf.length == 3) {
            int o1 = (utf[2] & 0x001f);
            o1 = o1 < < 6;
            int o2 = utf&#91;1&#93; & 0x001f;
            o2 = o2 << 11;
            int o3 = utf&#91;0&#93; & 0x0007;
            o3 = o3 << 16;
            int c1 = o3 + o2 + o1;
            c1 = c1 >> 6;
            return (char)c1;
        }
        else if ( utf.length == 4) {
            int o1 = (utf[3] & 0x001f);
            o1 = o1 < < 1;
            int o2 = utf&#91;2&#93; & 0x001f;
            o2 = o2 << 6;
            int o3 = utf&#91;1&#93; & 0x001f;
            o3 = o3 << 11;
            int o4 = utf&#91;0&#93; & 0x0001;
            o4 = o4 << 16;
            int c1 = o4 + o3 + o2 + o1;
            c1 = c1 >> 1;
            return (char)c1;
        }
        else {
            throw new RuntimeException("not UTF-7 bytes");
        }
    }

}