Posted by: Neo on: January 1, 2009
I wrote this a long time ago for a legacy database in which even UTF-8 character sequences were overflowing.
It may help someone.
$ java pjam.encoding.UTF7 I: ॐㄋㄧㄊㄧㄋॐ E: rJPxLHKxLIGxLHJxLIGxLHKrJP D: ॐㄋㄧㄊㄧㄋॐ
/*
* UTF7.java
*
* Created on October 7, 2004, 4:53 PM
*/
package pjam.encoding;
import java.io.ByteArrayOutputStream;
import java.io.CharArrayWriter;
import java.io.ByteArrayInputStream;
/**
 * A UTF-8-like transformation format that packs each Java {@code char}
 * (UTF-16 code unit) into one to four 7-bit units ("heptades").
 *
 * <p>The scheme is modelled on UTF-8 (RFC 3629) but every emitted byte stays in
 * the range 0x00-0x7F. Despite the class name it is <em>not</em> the
 * mail-safe UTF-7 encoding defined by RFC 2152.
 *
 * <pre>
 * Char. number | Heptade sequence
 * range        |
 * -------------------------------------------------
 * 0000 - 003F  | 0xxxxxx
 * 0040 - 01FF  | 110xxxx 10xxxxx
 * 0200 - 1FFF  | 1110xxx 10xxxxx 10xxxxx
 * 2000 - FFFF  | 111100x 10xxxxx 10xxxxx 10xxxxx
 * </pre>
 *
 * <p>Only 16-bit values are handled. A supplementary character reaches this
 * class as a surrogate pair, and each surrogate is encoded on its own as a
 * four-heptade sequence; decoding restores the same pair.
 *
 * <p>The original author also sketched a wider layout for U+0000 to U+10FFFF.
 * It is <em>not</em> implemented here, and its last row carries only 20 payload
 * bits, so it would need adjusting before it could reach U+10FFFF:
 *
 * <pre>
 * 000000 - 00003F | 0xxxxxx
 * 000040 - 0001FF | 110xxxx 10xxxxx
 * 000200 - 001FFF | 1110xxx 10xxxxx 10xxxxx
 * 002000 - 01FFFF | 11110xx 10xxxxx 10xxxxx 10xxxxx
 * 020000 - 10FFFF | 1111100 10xxxxx 10xxxxx 10xxxxx 10xxxxx
 * </pre>
 *
 * <p>Usage:
 *
 * <pre>
 * final String string = "Nitin";
 * final String utf7String = UTF7.encodeString(string);
 * final String decodedString = UTF7.decodeString(utf7String);
 * if (!decodedString.equals(string)) {
 *     // Should never happen
 * }
 * </pre>
 *
 * <p>This code is only meant to support 7-bit systems, charsets or streams; it
 * should not be used to encode any Internet stream.
 *
 * <p>The decoder checks the framing of each sequence but does not reject
 * over-long forms (for example a two-heptade sequence whose payload would fit
 * in a single heptade).
 *
 * @author Nitin Verma
 * @version 1.0
 */
public final class UTF7 {

    /** Largest value that is stored as a single heptade. */
    private static final int MAX_ONE_HEPTADE = 0x3F;

    /** Largest value that fits the two-heptade form (4 + 5 payload bits). */
    private static final int MAX_TWO_HEPTADES = 0x1FF;

    /** Largest value that fits the three-heptade form (3 + 5 + 5 payload bits). */
    private static final int MAX_THREE_HEPTADES = 0x1FFF;

    /** Marker bits {@code 10xxxxx} carried by every continuation heptade. */
    private static final int TAIL_MARKER = 0x40;

    /** Mask selecting the five payload bits of a continuation heptade. */
    private static final int TAIL_PAYLOAD = 0x1F;

    /**
     * Small demonstration: encodes a sample string, decodes it again and prints
     * the input, the encoded form and the decoded form.
     *
     * @param args ignored
     */
    public static void main(final String[] args) {
        // Nitin in Chinese Bopomofo between OM!
        final String string = "\u0950\u310B\u3127\u310A\u3127\u310B\u0950";
        System.out.println("I: " + string);
        final String utf7String = UTF7.encodeString(string);
        System.out.println("E: " + utf7String);
        final String decodedString = UTF7.decodeString(utf7String);
        System.out.println("D: " + decodedString);
        if (!decodedString.equals(string)) {
            System.out.println(" Should never happen ");
        }
    }

    private UTF7() {
    }

    /**
     * Encodes a string and returns the heptades as a string of 7-bit characters.
     *
     * @param bmpString the text to encode
     * @return a string whose characters are all in the range 0x00-0x7F
     */
    public static String encodeString(final String bmpString) {
        final byte[] encoded = encodeChars(bmpString.toCharArray());
        // Every heptade is below 0x80, so widening byte to char is exact and the
        // result does not depend on the platform default charset.
        final char[] sevenBit = new char[encoded.length];
        for (int i = 0; i < encoded.length; i++) {
            sevenBit[i] = (char) encoded[i];
        }
        return new String(sevenBit);
    }

    /**
     * Encodes chars to a heptade sequence.
     *
     * @param bmpChars the chars to encode
     * @return the concatenated heptade sequences, one byte per heptade
     */
    public static byte[] encodeChars(final char[] bmpChars) {
        final ByteArrayOutputStream baos = new ByteArrayOutputStream(bmpChars.length);
        for (int i = 0; i < bmpChars.length; i++) {
            final byte[] sequence = encodeChar(bmpChars[i]);
            // The three-argument overload declares no IOException.
            baos.write(sequence, 0, sequence.length);
        }
        return baos.toByteArray();
    }

    /**
     * Encodes a single char to its heptade sequence.
     *
     * @param bmpChar the char to encode
     * @return one to four bytes, lead heptade first, each in the range 0x00-0x7F
     */
    public static byte[] encodeChar(final char bmpChar) {
        if (bmpChar <= MAX_ONE_HEPTADE) {
            return new byte[] {(byte) bmpChar};
        }
        // The payload is split into 5-bit groups starting from the low end.
        final byte tail1 = (byte) (TAIL_MARKER | (bmpChar & TAIL_PAYLOAD));
        if (bmpChar <= MAX_TWO_HEPTADES) {
            return new byte[] {(byte) (0x60 | (bmpChar >> 5)), tail1};
        }
        final byte tail2 = (byte) (TAIL_MARKER | ((bmpChar >> 5) & TAIL_PAYLOAD));
        if (bmpChar <= MAX_THREE_HEPTADES) {
            return new byte[] {(byte) (0x70 | (bmpChar >> 10)), tail2, tail1};
        }
        // A char never exceeds 0xFFFF, so the four-heptade form covers the rest.
        final byte tail3 = (byte) (TAIL_MARKER | ((bmpChar >> 10) & TAIL_PAYLOAD));
        return new byte[] {(byte) (0x78 | (bmpChar >> 15)), tail3, tail2, tail1};
    }

    /**
     * Decodes a string produced by {@link #encodeString(String)}.
     *
     * @param utfString a string holding one heptade per character
     * @return the decoded string
     * @throws IllegalArgumentException if a character is above 0x7F or the
     *         heptades do not form valid sequences
     */
    public static String decodeString(final String utfString) {
        final byte[] heptades = new byte[utfString.length()];
        for (int i = 0; i < heptades.length; i++) {
            final char c = utfString.charAt(i);
            if (c > 0x7F) {
                throw new IllegalArgumentException(
                        "not UTF-7 string, char at index " + i + " is not 7-bit: 0x"
                        + Integer.toHexString(c));
            }
            heptades[i] = (byte) c;
        }
        return new String(decodeUTFBytes(heptades));
    }

    /**
     * Decodes a heptade sequence to chars.
     *
     * @param utfBytes the encoded bytes, one heptade per byte
     * @return the decoded chars
     * @throws IllegalArgumentException if a byte has its eighth bit set, a lead
     *         heptade is malformed, a continuation heptade is malformed, or the
     *         input ends in the middle of a sequence
     */
    public static char[] decodeUTFBytes(final byte[] utfBytes) {
        final ByteArrayInputStream bais = new ByteArrayInputStream(utfBytes);
        final CharArrayWriter caw = new CharArrayWriter(utfBytes.length / 4);
        int lead;
        while ((lead = bais.read()) != -1) {
            if (lead <= MAX_ONE_HEPTADE) {
                caw.write(lead);
                continue;
            }
            final byte[] sequence = new byte[sequenceLength(lead)];
            sequence[0] = (byte) lead;
            for (int i = 1; i < sequence.length; i++) {
                sequence[i] = (byte) readTailByte(bais, sequence, i);
            }
            caw.write(decodeUTF(sequence));
        }
        return caw.toCharArray();
    }

    /** Returns the total sequence length (2, 3 or 4) announced by a multi-heptade lead. */
    private static int sequenceLength(final int lead) {
        if (lead <= 0x7F) {
            if ((lead & 0x70) == 0x60) {
                return 2; // 110xxxx
            }
            if ((lead & 0x78) == 0x70) {
                return 3; // 1110xxx
            }
            if ((lead & 0x7E) == 0x78) {
                return 4; // 111100x
            }
        }
        throw new IllegalArgumentException(
                "not UTF-7 byte sequence, bad sequence byte " + Integer.toBinaryString(lead));
    }

    /** Reads the next byte and verifies that it is a continuation heptade {@code 10xxxxx}. */
    private static int readTailByte(final ByteArrayInputStream bais,
                                    final byte[] sequence,
                                    final int alreadyRead) {
        final int b = bais.read();
        if (b == -1) {
            throw new IllegalArgumentException(
                    "not UTF-7 byte sequence, truncated after "
                    + toBinaryString(sequence, alreadyRead));
        }
        // Mask 0xE0 also covers the eighth bit, so 8-bit bytes are rejected here.
        if ((b & 0xE0) != TAIL_MARKER) {
            throw new IllegalArgumentException(
                    "not UTF-7 byte sequence, bad bytes sequence "
                    + toBinaryString(sequence, alreadyRead) + Integer.toBinaryString(b));
        }
        return b;
    }

    /** Renders the first {@code count} bytes in binary, each followed by a space. */
    private static String toBinaryString(final byte[] bytes, final int count) {
        final StringBuffer sb = new StringBuffer();
        for (int i = 0; i < count; i++) {
            sb.append(Integer.toBinaryString(bytes[i] & 0x00ff));
            sb.append(" ");
        }
        return sb.toString();
    }

    /**
     * Decodes the heptade sequence of a single char.
     *
     * <p>Only the payload bits are read; the marker bits are not validated here
     * (that is done by {@link #decodeUTFBytes(byte[])}).
     *
     * @param utf one to four bytes, lead heptade first
     * @return the decoded char
     * @throws IllegalArgumentException if the array length is not 1, 2, 3 or 4
     */
    public static char decodeUTF(final byte[] utf) {
        switch (utf.length) {
            case 1:
                return (char) utf[0];
            case 2:
                return (char) (((utf[0] & 0x0F) << 5)
                        | (utf[1] & TAIL_PAYLOAD));
            case 3:
                return (char) (((utf[0] & 0x07) << 10)
                        | ((utf[1] & TAIL_PAYLOAD) << 5)
                        | (utf[2] & TAIL_PAYLOAD));
            case 4:
                return (char) (((utf[0] & 0x01) << 15)
                        | ((utf[1] & TAIL_PAYLOAD) << 10)
                        | ((utf[2] & TAIL_PAYLOAD) << 5)
                        | (utf[3] & TAIL_PAYLOAD));
            default:
                throw new IllegalArgumentException("not UTF-7 bytes");
        }
    }
}
[...] UTF-7 heptade sequence – Encode Devanagari (ॐ) or Bopomofo (ㄋㄧㄊㄧㄋ) [...]
January 1, 2009 at 8:11 pm
Bro . . .a little request . . .all looks awesome but can u explain a little and let people know how to exactly use it . . .anyone at my level of java (which is pretty much ZERO) will be confused ! ! ! how to use all that ? ? I am Marketing / PR / Sales guy just know simple stuff nothing complicated
January 2, 2009 at 7:14 am
Read this blog may help http://vermanitin.wordpress.com/2009/01/01/utf-8-using-plain-hands/ else I’ll call you bro! Better be on a linux box.