Nitin Verma’s Blog

Archive for the ‘Java’ Category

Had done this long time ago for a legacy database, where UTF-8 char sequence was also overflowing 🙂 . May help someone.

RFC 3629

PJAM

$ java pjam.encoding.UTF7
I: ॐㄋㄧㄊㄧㄋॐ
E: rJPxLHKxLIGxLHJxLIGxLHKrJP
D: ॐㄋㄧㄊㄧㄋॐ
/*
 * UTF7.java
 *
 * Created on October 7, 2004, 4:53 PM
 */

package pjam.encoding;

import java.io.ByteArrayOutputStream;
import java.io.CharArrayWriter;
import java.io.ByteArrayInputStream;

/**
 * See RFC 3629,
 * this is a very similar transformation format of ISO 10646 in 7-bit.
 *
 * Implementation is for BMP support.
 * 

 * Char. number| UTF-7 heptade sequence
 * range       |
 * -------------------------------------------
 * 0000 - 003F | 0xxxxxx
 * 0040 - 01FF | 110xxxx 10xxxxx
 * 0200 - 1FFF | 1110xxx 10xxxxx 10xxxxx
 * 2000 - FFFF | 111100x 10xxxxx 10xxxxx 10xxxxx
 * 

 *
 * Can also support UTF-16 range (U+0000 to U+10FFFF), but would need
 * UTF-16 transformation (I think)
 * for char > U+FFFF as char size in java is 2 bytes only.
 * 

 * Char. number range | UTF-7 heptade sequence
 * -------------------------------------------
 * 000000 - 00003F    | 0xxxxxx
 * 000040 - 0001FF    | 110xxxx 10xxxxx
 * 000200 - 001FFF    | 1110xxx 10xxxxx 10xxxxx
 * 002000 - 01FFFF    | 11110xx 10xxxxx 10xxxxx 10xxxxx
 * 020000 - 10FFFF    | 1111100 10xxxxx 10xxxxx 10xxxxx 10xxxxx
 * 

 *
 * 

 *  final String string = "Nitin";
 *  final String utf7String = UTF7.encodeString(string);
 *  final String decodedString = UTF7.decodeString(utf7String)
 *  if ( !decodedString.equals(string)) {
 *      //Should never happen
 *  }
 * 

 *
 * This code is only to support 7-bit systems or charsets or streams, should not be used
 * to encode any Internet stream.
 *
 * @author  Nitin Verma
 * @version 1.0
 *
 */
public class UTF7 {

    public static void main ( final String args [] ) {
        // Nitin in Chinese Bopomofo between OM!
        final String string = "\u0950\u310B\u3127\u310A\u3127\u310B\u0950";
        System.out.println("I: " + string);
        final String utf7String = UTF7.encodeString(string);
        System.out.println("E: " + utf7String);
        final String decodedString = UTF7.decodeString(utf7String);
        System.out.println("D: " + decodedString);
        if ( !decodedString.equals(string)) {
             System.out.println( " Should never happen " );
        }
    }

    private UTF7() {
    }

    /** Gives a string that envelops UTF-7 bytes
     * @param bmpString A string only having chars in 'Basic Multilingual Plane'
     * @return UTF-7 String
     */
    public static String encodeString(final String bmpString) {
        char [] chars = new char[bmpString.length()];
        bmpString.getChars(0, bmpString.length(), chars, 0);
        return new String(encodeChars(chars));
    }

    /** Encodes BMP chars to UTF-7 byte sequence
     * @param bmpChars chars in 'Basic Multilingual Plane'
     * @return UTF-7 bytes
     */
    public static byte [] encodeChars(final char [] bmpChars) {
        ByteArrayOutputStream baos = new ByteArrayOutputStream(bmpChars.length);
        try {
            for ( int i = 0; i < bmpChars.length; i++ ) {
                baos.write(encodeChar(bmpChars&#91;i&#93;));
            }
        }
        catch (java.io.IOException ioe) {
            // will never happen
        }
        return baos.toByteArray();
    }

    /** Encodes BMP char to UTF-7 byte sequence
     * @param bmpChar char in 'Basic Multilingual Plane'
     * @return UTF-7 bytes
     */
    public static byte &#91;&#93; encodeChar(final char bmpChar) {
        if ( bmpChar >= 0x0 && bmpChar < = 0x3F ) {
             int b1 = (bmpChar & 0x3F);
             return new byte &#91;&#93; {(byte)b1};
        }
        else if ( bmpChar > 0x3F && bmpChar < = 0x1FF ) {
            int b1 = (bmpChar & 0x1F);
            b1 = 0x40 + b1;
            int b2=(bmpChar & 0x1FF) >> 5;
            b2 = 0x60 + b2;




            return new byte [] {(byte)b2, (byte)b1};
        }
        else if ( bmpChar > 0x1FF && bmpChar < = 0x1FFF ) {
            int b1 = (bmpChar & 0x1F);
            b1 = 0x40 + b1;
            int b2=(bmpChar & 0x3FF) >> 5;
            b2 = 0x40 + b2;
            int b3 = (bmpChar & 0x1c00) >> 10;
            b3 = 0x70 + b3;
            return new byte [] {(byte)b3, (byte)b2, (byte)b1};
        }
        else if ( bmpChar > 0x1FFF && bmpChar < = 0xFFFF ) {
            int b1 = (bmpChar & 0x1F);
            b1 = 0x40 + b1;
            int b2=(bmpChar & 0x3FF) >> 5;
            b2 = 0x40 + b2;
            int b3 = (bmpChar & 0x7c00) >> 10;
            b3 = 0x40 + b3;
            int b4 = (bmpChar & 0x8000) >> 15;
            b4 = 0x78 + b4;
            return new byte [] {(byte)b4, (byte)b3, (byte)b2, (byte)b1};
        }
        else {
            throw new RuntimeException("Only BMP charset support (U+0000 to U+FFFF), non-bmp char was " + bmpChar);
        }
    }

    /** Decodes UTF-7 encoded string
     * @param utfString A valid UTF-7 encoded string
     * @return Decoded string
     */
    public static String decodeString(final String utfString) {
        return new String(decodeUTFBytes(utfString.getBytes()));
    }

    /** Decodes UTF-7 byte sequence to BMP chars
     * @param utfBytes A valid UTF-7 encoded byte sequence
     * @return Decoded BMP chars
     */
    public static char [] decodeUTFBytes(final byte [] utfBytes) {
        ByteArrayInputStream bais = new ByteArrayInputStream(utfBytes);
        CharArrayWriter caw = new CharArrayWriter(utfBytes.length/4);
        int readByte = 0;
        while( (readByte = bais.read()) != -1 ) {

            if (readByte >= 0x0 && readByte < = 0x3F ) {
                caw.write((int)readByte);
                continue;
            }

            int sequenceByte = readByte;
            // Does it start with xxx0?
            if ( (sequenceByte & 0x10) == 0x0 ) {
                // Should start with x110
                if( (sequenceByte & 0x70) != 0x60 ) {
                    throw new RuntimeException("not UTF-7 byte sequence, bad sequence byte " + Integer.toBinaryString(sequenceByte));
                }
                int b1 = bais.read();
                checkTailByte(b1, new int &#91;&#93; {sequenceByte});
                caw.write(decodeUTF(new byte&#91;&#93; {(byte)readByte, (byte)b1}));
            }
            // Is it xxxx0xxx?
            else if ( (sequenceByte & 0x08) == 0x0 ) {
                // Should start with x1110
                if( (sequenceByte & 0x78) != 0x70 ) {
                    throw new RuntimeException("not UTF-7 byte sequence, bad sequence byte " + Integer.toBinaryString(sequenceByte));
                }
                int b1 = bais.read();
                checkTailByte(b1, new int &#91;&#93; {sequenceByte});
                int b2 = bais.read();
                checkTailByte(b2, new int &#91;&#93; {sequenceByte, b1});
                caw.write(decodeUTF(new byte&#91;&#93; {(byte)readByte, (byte)b1, (byte)b2}));
            }
            // Is it xxxxx00x?
            else if ( (sequenceByte & 0x06) == 0x0 ) {
                // Should start with x11110
                if( (sequenceByte & 0x7C) != 0x78 ) {
                    throw new RuntimeException("not UTF-7 byte sequence, bad sequence byte " + Integer.toBinaryString(sequenceByte));
                }
                int b1 = bais.read();
                checkTailByte(b1, new int &#91;&#93; {sequenceByte});
                int b2 = bais.read();
                checkTailByte(b2, new int &#91;&#93; {sequenceByte, b1});
                int b3 = bais.read();
                checkTailByte(b3, new int &#91;&#93; {sequenceByte, b1, b2});
                caw.write(decodeUTF(new byte&#91;&#93; {(byte)readByte, (byte)b1, (byte)b2, (byte)b3}));
            }
            else {
                throw new RuntimeException("not UTF-7 byte sequence");
            }
        }
        return caw.toCharArray();
    }

    private static void checkTailByte(int b, int &#91;&#93; priorBytes) {
        // Should start with x10x
        if( (b & 0x60) != 0x40 ) {
            throw new RuntimeException("not UTF-7 byte sequence, bad bytes sequence " + toBinaryString(priorBytes) + Integer.toBinaryString(b & 0x00ff));
        }
    }

    private static String toBinaryString(int &#91;&#93; bytes) {
        StringBuffer sb = new StringBuffer();
        for ( int i = 0; i < bytes.length; i++ ) {
            sb.append(Integer.toBinaryString(bytes&#91;i&#93; & 0x00ff));
            sb.append(" ");
        }
        return sb.toString();
    }

    /** Decodes UTF-7 byte sequence to BMP char.
     * @param utf A valid UTF-7 encoded byte sequence for a single BMP char
     * @return Decoded BMP char
     */
    public static char decodeUTF(final byte &#91;&#93; utf) {
        if ( utf.length == 1 ) {
            return (char) utf&#91;0&#93;;
        }
        else if ( utf.length == 2 ) {
            int o1 = (utf&#91;1&#93; & 0x001f);
            o1 = o1 << 3;
            int o2 = utf&#91;0&#93; & 0x000f;
            o2 = o2 << 8;
            int c1 = o2 + o1;
            c1 = c1 >> 3;
            return (char)c1;
        }
        else if ( utf.length == 3) {
            int o1 = (utf[2] & 0x001f);
            o1 = o1 < < 6;
            int o2 = utf&#91;1&#93; & 0x001f;
            o2 = o2 << 11;
            int o3 = utf&#91;0&#93; & 0x0007;
            o3 = o3 << 16;
            int c1 = o3 + o2 + o1;
            c1 = c1 >> 6;
            return (char)c1;
        }
        else if ( utf.length == 4) {
            int o1 = (utf[3] & 0x001f);
            o1 = o1 < < 1;
            int o2 = utf&#91;2&#93; & 0x001f;
            o2 = o2 << 6;
            int o3 = utf&#91;1&#93; & 0x001f;
            o3 = o3 << 11;
            int o4 = utf&#91;0&#93; & 0x0001;
            o4 = o4 << 16;
            int c1 = o4 + o3 + o2 + o1;
            c1 = c1 >> 1;
            return (char)c1;
        }
        else {
            throw new RuntimeException("not UTF-7 bytes");
        }
    }

}

Using jdk 1.6 the cost of blocking synchronization is nearly same as
non-blocking counter part. This does not say 1.6 makes it liveness
hazard free or deadlock free.

But makes it quite fast.

// We have three methods
// -1: Thread unsafe
    public static void addUnsafe() {
        b++;
    }
// 0: Thread safe, non-blocking synchronization
    public static void addNBSync() {
        try {
            lock.lock();
            b++;
        } finally {
            lock.unlock();
        }
    }
// 1: Thread safe, blocking synchronization
    public static synchronized void addSync() {
        b++;
    }

// Log format
//Use:<method>:<end value of b>:<time taken>
$ ~/usr/local/jdk1.5.0_17/bin/java -classpath classes1.5  edu.nitin.Main
Use:-1:63929500:156367000
Use:0:100000000:5360473000
Use:1:100000000:24290461000

$ echo "scale=10; ( 5360473000 - 156367000 ) / 156367000 " | bc
33.2813573196
$ echo "scale=10; ( 24290461000 - 156367000 ) / 156367000 " | bc
154.3426298387

$ ~/usr/local/jdk1.6.0_11/bin/java -classpath classes1.6  edu.nitin.Main
Use:-1:59050465:190538454
Use:0:100000000:5350310972
Use:1:100000000:6404364288

$ echo "scale=10; ( 5350310972 - 190538454 ) / 156367000 " | bc
32.9978353361
$ echo "scale=10; ( 6404364288 - 190538454 ) / 156367000 " | bc
39.7387289773

A test running 1000 threads doing 100,000 atomic operations each,

all the three ways.

...

package edu.nitin;

import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

/**
 *
 * @author Nitin Verma
 */
public class Main {

    private final static Lock lock = new ReentrantLock();

    private static long b = 0;
    private final static long n = 100000;
    private final static int a = 1000;

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) throws InterruptedException {
        test(-1);
        test(0);
        test(1);
    }

    private static void test(final int use) throws InterruptedException {
        b = 0;
        final long start = System.nanoTime();
        long end = 0;
        try {
            final Thread[] ta = new Thread[a];
            for (int i = 0; i < a; i++) {                 ta[i] = new Thread(new MyRunnable(use));                 ta[i].start();             }             for (int i = 0; i < a; i++) {                 if (ta[i] != null && ta[i].isAlive()) {                     ta[i].join();                 }             }         } finally {             end = System.nanoTime();         }         System.out.println("Use:" + use + ":" + b + ":" + (end - start));     }     public static void addN(final long n, final int use) {         if (use == 0) {             for (long i = 0; i < n; i++) {                 addNBSync();             }         }         else if (use == 1)  {             for (long i = 0; i < n; i++) {                 addSync();             }         }         else {             for (long i = 0; i < n; i++) {                 addUnsafe();             }         }     }     public static void addNBSync() {         try {             lock.lock();             b++;         } finally {             lock.unlock();         }     }     public static synchronized void addSync() {         b++;     }     public static void addUnsafe() {         b++;     }     private static class MyRunnable implements Runnable {         private int use;         public MyRunnable(final int use) {             this.use = use;         }         public void run() {             addN(n, use);         }     } } [/sourcecode]