Nitin Verma’s Blog

Had done this long time ago for a legacy database, where UTF-8 char sequence was also overflowing 🙂 . May help someone.

RFC 3629

PJAM

$ java pjam.encoding.UTF7
I: ॐㄋㄧㄊㄧㄋॐ
E: rJPxLHKxLIGxLHJxLIGxLHKrJP
D: ॐㄋㄧㄊㄧㄋॐ
/*
 * UTF7.java
 *
 * Created on October 7, 2004, 4:53 PM
 */

package pjam.encoding;

import java.io.ByteArrayOutputStream;
import java.io.CharArrayWriter;
import java.io.ByteArrayInputStream;

/**
 * See RFC 3629,
 * this is a very similar transformation format of ISO 10646 in 7-bit.
 *
 * Implementation is for BMP support.
 * 

 * Char. number| UTF-7 heptade sequence
 * range       |
 * -------------------------------------------
 * 0000 - 003F | 0xxxxxx
 * 0040 - 01FF | 110xxxx 10xxxxx
 * 0200 - 1FFF | 1110xxx 10xxxxx 10xxxxx
 * 2000 - FFFF | 111100x 10xxxxx 10xxxxx 10xxxxx
 * 

 *
 * Can also support UTF-16 range (U+0000 to U+10FFFF), but would need
 * UTF-16 transformation (I think)
 * for char > U+FFFF as char size in java is 2 bytes only.
 * 

 * Char. number range | UTF-7 heptade sequence
 * -------------------------------------------
 * 000000 - 00003F    | 0xxxxxx
 * 000040 - 0001FF    | 110xxxx 10xxxxx
 * 000200 - 001FFF    | 1110xxx 10xxxxx 10xxxxx
 * 002000 - 01FFFF    | 11110xx 10xxxxx 10xxxxx 10xxxxx
 * 020000 - 10FFFF    | 1111100 10xxxxx 10xxxxx 10xxxxx 10xxxxx
 * 

 *
 * 

 *  final String string = "Nitin";
 *  final String utf7String = UTF7.encodeString(string);
 *  final String decodedString = UTF7.decodeString(utf7String)
 *  if ( !decodedString.equals(string)) {
 *      //Should never happen
 *  }
 * 

 *
 * This code is only to support 7-bit systems or charsets or streams, should not be used
 * to encode any Internet stream.
 *
 * @author  Nitin Verma
 * @version 1.0
 *
 */
public class UTF7 {

    public static void main ( final String args [] ) {
        // Nitin in Chinese Bopomofo between OM!
        final String string = "\u0950\u310B\u3127\u310A\u3127\u310B\u0950";
        System.out.println("I: " + string);
        final String utf7String = UTF7.encodeString(string);
        System.out.println("E: " + utf7String);
        final String decodedString = UTF7.decodeString(utf7String);
        System.out.println("D: " + decodedString);
        if ( !decodedString.equals(string)) {
             System.out.println( " Should never happen " );
        }
    }

    private UTF7() {
    }

    /** Gives a string that envelops UTF-7 bytes
     * @param bmpString A string only having chars in 'Basic Multilingual Plane'
     * @return UTF-7 String
     */
    public static String encodeString(final String bmpString) {
        char [] chars = new char[bmpString.length()];
        bmpString.getChars(0, bmpString.length(), chars, 0);
        return new String(encodeChars(chars));
    }

    /** Encodes BMP chars to UTF-7 byte sequence
     * @param bmpChars chars in 'Basic Multilingual Plane'
     * @return UTF-7 bytes
     */
    public static byte [] encodeChars(final char [] bmpChars) {
        ByteArrayOutputStream baos = new ByteArrayOutputStream(bmpChars.length);
        try {
            for ( int i = 0; i < bmpChars.length; i++ ) {
                baos.write(encodeChar(bmpChars&#91;i&#93;));
            }
        }
        catch (java.io.IOException ioe) {
            // will never happen
        }
        return baos.toByteArray();
    }

    /** Encodes BMP char to UTF-7 byte sequence
     * @param bmpChar char in 'Basic Multilingual Plane'
     * @return UTF-7 bytes
     */
    public static byte &#91;&#93; encodeChar(final char bmpChar) {
        if ( bmpChar >= 0x0 && bmpChar < = 0x3F ) {
             int b1 = (bmpChar & 0x3F);
             return new byte &#91;&#93; {(byte)b1};
        }
        else if ( bmpChar > 0x3F && bmpChar < = 0x1FF ) {
            int b1 = (bmpChar & 0x1F);
            b1 = 0x40 + b1;
            int b2=(bmpChar & 0x1FF) >> 5;
            b2 = 0x60 + b2;




            return new byte [] {(byte)b2, (byte)b1};
        }
        else if ( bmpChar > 0x1FF && bmpChar < = 0x1FFF ) {
            int b1 = (bmpChar & 0x1F);
            b1 = 0x40 + b1;
            int b2=(bmpChar & 0x3FF) >> 5;
            b2 = 0x40 + b2;
            int b3 = (bmpChar & 0x1c00) >> 10;
            b3 = 0x70 + b3;
            return new byte [] {(byte)b3, (byte)b2, (byte)b1};
        }
        else if ( bmpChar > 0x1FFF && bmpChar < = 0xFFFF ) {
            int b1 = (bmpChar & 0x1F);
            b1 = 0x40 + b1;
            int b2=(bmpChar & 0x3FF) >> 5;
            b2 = 0x40 + b2;
            int b3 = (bmpChar & 0x7c00) >> 10;
            b3 = 0x40 + b3;
            int b4 = (bmpChar & 0x8000) >> 15;
            b4 = 0x78 + b4;
            return new byte [] {(byte)b4, (byte)b3, (byte)b2, (byte)b1};
        }
        else {
            throw new RuntimeException("Only BMP charset support (U+0000 to U+FFFF), non-bmp char was " + bmpChar);
        }
    }

    /** Decodes UTF-7 encoded string
     * @param utfString A valid UTF-7 encoded string
     * @return Decoded string
     */
    public static String decodeString(final String utfString) {
        return new String(decodeUTFBytes(utfString.getBytes()));
    }

    /** Decodes UTF-7 byte sequence to BMP chars
     * @param utfBytes A valid UTF-7 encoded byte sequence
     * @return Decoded BMP chars
     */
    public static char [] decodeUTFBytes(final byte [] utfBytes) {
        ByteArrayInputStream bais = new ByteArrayInputStream(utfBytes);
        CharArrayWriter caw = new CharArrayWriter(utfBytes.length/4);
        int readByte = 0;
        while( (readByte = bais.read()) != -1 ) {

            if (readByte >= 0x0 && readByte < = 0x3F ) {
                caw.write((int)readByte);
                continue;
            }

            int sequenceByte = readByte;
            // Does it start with xxx0?
            if ( (sequenceByte & 0x10) == 0x0 ) {
                // Should start with x110
                if( (sequenceByte & 0x70) != 0x60 ) {
                    throw new RuntimeException("not UTF-7 byte sequence, bad sequence byte " + Integer.toBinaryString(sequenceByte));
                }
                int b1 = bais.read();
                checkTailByte(b1, new int &#91;&#93; {sequenceByte});
                caw.write(decodeUTF(new byte&#91;&#93; {(byte)readByte, (byte)b1}));
            }
            // Is it xxxx0xxx?
            else if ( (sequenceByte & 0x08) == 0x0 ) {
                // Should start with x1110
                if( (sequenceByte & 0x78) != 0x70 ) {
                    throw new RuntimeException("not UTF-7 byte sequence, bad sequence byte " + Integer.toBinaryString(sequenceByte));
                }
                int b1 = bais.read();
                checkTailByte(b1, new int &#91;&#93; {sequenceByte});
                int b2 = bais.read();
                checkTailByte(b2, new int &#91;&#93; {sequenceByte, b1});
                caw.write(decodeUTF(new byte&#91;&#93; {(byte)readByte, (byte)b1, (byte)b2}));
            }
            // Is it xxxxx00x?
            else if ( (sequenceByte & 0x06) == 0x0 ) {
                // Should start with x11110
                if( (sequenceByte & 0x7C) != 0x78 ) {
                    throw new RuntimeException("not UTF-7 byte sequence, bad sequence byte " + Integer.toBinaryString(sequenceByte));
                }
                int b1 = bais.read();
                checkTailByte(b1, new int &#91;&#93; {sequenceByte});
                int b2 = bais.read();
                checkTailByte(b2, new int &#91;&#93; {sequenceByte, b1});
                int b3 = bais.read();
                checkTailByte(b3, new int &#91;&#93; {sequenceByte, b1, b2});
                caw.write(decodeUTF(new byte&#91;&#93; {(byte)readByte, (byte)b1, (byte)b2, (byte)b3}));
            }
            else {
                throw new RuntimeException("not UTF-7 byte sequence");
            }
        }
        return caw.toCharArray();
    }

    private static void checkTailByte(int b, int &#91;&#93; priorBytes) {
        // Should start with x10x
        if( (b & 0x60) != 0x40 ) {
            throw new RuntimeException("not UTF-7 byte sequence, bad bytes sequence " + toBinaryString(priorBytes) + Integer.toBinaryString(b & 0x00ff));
        }
    }

    private static String toBinaryString(int &#91;&#93; bytes) {
        StringBuffer sb = new StringBuffer();
        for ( int i = 0; i < bytes.length; i++ ) {
            sb.append(Integer.toBinaryString(bytes&#91;i&#93; & 0x00ff));
            sb.append(" ");
        }
        return sb.toString();
    }

    /** Decodes UTF-7 byte sequence to BMP char.
     * @param utf A valid UTF-7 encoded byte sequence for a single BMP char
     * @return Decoded BMP char
     */
    public static char decodeUTF(final byte &#91;&#93; utf) {
        if ( utf.length == 1 ) {
            return (char) utf&#91;0&#93;;
        }
        else if ( utf.length == 2 ) {
            int o1 = (utf&#91;1&#93; & 0x001f);
            o1 = o1 << 3;
            int o2 = utf&#91;0&#93; & 0x000f;
            o2 = o2 << 8;
            int c1 = o2 + o1;
            c1 = c1 >> 3;
            return (char)c1;
        }
        else if ( utf.length == 3) {
            int o1 = (utf[2] & 0x001f);
            o1 = o1 < < 6;
            int o2 = utf&#91;1&#93; & 0x001f;
            o2 = o2 << 11;
            int o3 = utf&#91;0&#93; & 0x0007;
            o3 = o3 << 16;
            int c1 = o3 + o2 + o1;
            c1 = c1 >> 6;
            return (char)c1;
        }
        else if ( utf.length == 4) {
            int o1 = (utf[3] & 0x001f);
            o1 = o1 < < 1;
            int o2 = utf&#91;2&#93; & 0x001f;
            o2 = o2 << 6;
            int o3 = utf&#91;1&#93; & 0x001f;
            o3 = o3 << 11;
            int o4 = utf&#91;0&#93; & 0x0001;
            o4 = o4 << 16;
            int c1 = o4 + o3 + o2 + o1;
            c1 = c1 >> 1;
            return (char)c1;
        }
        else {
            throw new RuntimeException("not UTF-7 bytes");
        }
    }

}

Advertisements
// Refer to my previous post
// https://vermanitin.wordpress.com/2008/12/25/helloworld-flex/
$ ps -eafwww | grep hw | grep -v grep
nitinv   11703  6642  0 16:42 pts/7    00:00:00 ./hw

$ gcore -o hw.core 11703
(no debugging symbols found)
Using host libthread_db library "/lib/i686/nosegneg/libthread_db.so.1".
(no debugging symbols found)
(no debugging symbols found)
(no debugging symbols found)
0x00fac402 in __kernel_vsyscall ()
Saved corefile hw.core.11703

$ gdb ./hw hw.core.11703
GNU gdb Red Hat Linux (6.5-37.el5_2.2rh)
Copyright (C) 2006 Free Software Foundation, Inc.
GDB is free software, covered by the GNU General Public License, and you are
welcome to change it and/or distribute copies of it under certain conditions.
Type "show copying" to see the conditions.
There is absolutely no warranty for GDB.  Type "show warranty" for details.
This GDB was configured as "i386-redhat-linux-gnu"...(no debugging symbols found)
Using host libthread_db library "/lib/i686/nosegneg/libthread_db.so.1".


warning: Can't read pathname for load map: Input/output error.
Reading symbols from /lib/i686/nosegneg/libc.so.6...(no debugging symbols found)...done.
Loaded symbols for /lib/i686/nosegneg/libc.so.6
Reading symbols from /lib/ld-linux.so.2...(no debugging symbols found)...done.
Loaded symbols for /lib/ld-linux.so.2
(no debugging symbols found)
Core was generated by `/home/nitinv/codes/grammar/hw'.
#0  0x00fac402 in __kernel_vsyscall ()
(gdb) bt
#0  0x00fac402 in __kernel_vsyscall ()
#1  0x00b1d1d3 in __read_nocancel () from /lib/i686/nosegneg/libc.so.6
#2  0x00abd588 in _IO_file_read_internal () from /lib/i686/nosegneg/libc.so.6
#3  0x00abe930 in _IO_new_file_underflow () from /lib/i686/nosegneg/libc.so.6
#4  0x00abf02b in _IO_default_uflow_internal () from /lib/i686/nosegneg/libc.so.6
#5  0x00ac039d in __uflow () from /lib/i686/nosegneg/libc.so.6
#6  0x00aba10c in getc () from /lib/i686/nosegneg/libc.so.6
#7  0x08048fad in yy_get_next_buffer ()
#8  0x08048c84 in yylex ()
#9  0x08048655 in main ()

This even works with ulimit not set 😉

$ ulimit -a | grep core
core file size          (blocks, -c) 0

Neat way to stop and restart your process to take a dump. This can get handy sometime while getting a good snapshot.
Michael B. Trausch gcore: Obtain core dump of current running application — see comment

flex – fast lexical analyzer generator

// This is the flex version I have.
$ flex -V
flex version 2.5.4

// A Hello World lexical scanner.
$ cat -n hw.lex
     1  %{
     2  // Hello World!!!
     3  %}
     4  %%
     5
     6  [\t ]+  ;
     7
     8  a { printf( "' %s '", yytext); }
     9  hw { printf( "' -%s- '", yytext); }
    10  ^hw$ { printf( "' %s '", yytext); }
    11  ^exit$ { printf( "%s :have a nice day!!!\n", yytext); exit(0); }
    12
    13  %%
    14
    15  int yywrap()
    16  {
    17          return 1;
    18  }

// Generate your C code, with lex debug messages on. ( remove -d if you do not need those ).
$ flex -d -ohw.yy.c  hw.lex

$ more hw.yy.c
#line 2 "hw.yy.c"
/* A lexical scanner generated by flex*/

/* Scanner skeleton version:
 * $Header: /home/daffy/u0/vern/flex/RCS/flex.skl,v 2.91 96/09/10 16:58:48 vern Exp $
 */

#define FLEX_SCANNER
#define YY_FLEX_MAJOR_VERSION 2
#define YY_FLEX_MINOR_VERSION 5

#include <stdio .h>
#include <unistd .h>


/* cfront 1.2 defines "c_plusplus" instead of "__cplusplus" */
#ifdef c_plusplus
#ifndef __cplusplus
#define __cplusplus
#endif
#endif


#ifdef __cplusplus

#include <stdlib .h>

/* Use prototypes in function declarations. */
#define YY_USE_PROTOS

/* The "const" storage-class-modifier is valid. */
#define YY_USE_CONST

#else   /* ! __cplusplus */

#if __STDC__

#define YY_USE_PROTOS
#define YY_USE_CONST

#endif  /* __STDC__ */
#endif  /* ! __cplusplus */

#ifdef __TURBOC__
 #pragma warn -rch
 #pragma warn -use
#include <io .h>
#include <stdlib .h>
#define YY_USE_CONST
#define YY_USE_PROTOS
#endif
...

// Compile 

$ gcc -o hw -ll hw.yy.c

// Run your lexical scanner binary!!!

$ ./hw
--(end of buffer or a NUL)
hw
--accepting rule at line 10 ("hw
")
--accepting default rule ("
")
' hw '
--(end of buffer or a NUL)
hwhwa
--accepting rule at line 9 ("hw")
--accepting rule at line 9 ("hw")
--accepting rule at line 8 ("a")
--accepting default rule ("
")
' -hw- '' -hw- '' a '
--(end of buffer or a NUL)
hw hw a
--accepting rule at line 9 ("hw")
--accepting rule at line 6 (" ")
--accepting rule at line 9 ("hw")
--accepting rule at line 6 (" ")
--accepting rule at line 8 ("a")
--accepting default rule ("
")
' -hw- '' -hw- '' a '
--(end of buffer or a NUL)
exit
--accepting rule at line 11 ("exit
")
exit :have a nice day!!!


Using jdk 1.6 the cost of blocking synchronization is nearly same as
non-blocking counter part. This does not say 1.6 makes it liveness
hazard free or deadlock free.

But makes it quite fast.

// We have three methods
// -1: Thread unsafe
    public static void addUnsafe() {
        b++;
    }
// 0: Thread safe, non-blocking synchronization
    public static void addNBSync() {
        try {
            lock.lock();
            b++;
        } finally {
            lock.unlock();
        }
    }
// 1: Thread safe, blocking synchronization
    public static synchronized void addSync() {
        b++;
    }

// Log format
//Use:<method>:<end value of b>:<time taken>
$ ~/usr/local/jdk1.5.0_17/bin/java -classpath classes1.5  edu.nitin.Main
Use:-1:63929500:156367000
Use:0:100000000:5360473000
Use:1:100000000:24290461000

$ echo "scale=10; ( 5360473000 - 156367000 ) / 156367000 " | bc
33.2813573196
$ echo "scale=10; ( 24290461000 - 156367000 ) / 156367000 " | bc
154.3426298387

$ ~/usr/local/jdk1.6.0_11/bin/java -classpath classes1.6  edu.nitin.Main
Use:-1:59050465:190538454
Use:0:100000000:5350310972
Use:1:100000000:6404364288

$ echo "scale=10; ( 5350310972 - 190538454 ) / 156367000 " | bc
32.9978353361
$ echo "scale=10; ( 6404364288 - 190538454 ) / 156367000 " | bc
39.7387289773

A test running 1000 threads doing 100,000 atomic operations each,

all the three ways.

...

package edu.nitin;

import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

/**
 *
 * @author Nitin Verma
 */
public class Main {

    private final static Lock lock = new ReentrantLock();

    private static long b = 0;
    private final static long n = 100000;
    private final static int a = 1000;

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) throws InterruptedException {
        test(-1);
        test(0);
        test(1);
    }

    private static void test(final int use) throws InterruptedException {
        b = 0;
        final long start = System.nanoTime();
        long end = 0;
        try {
            final Thread[] ta = new Thread[a];
            for (int i = 0; i < a; i++) {                 ta[i] = new Thread(new MyRunnable(use));                 ta[i].start();             }             for (int i = 0; i < a; i++) {                 if (ta[i] != null && ta[i].isAlive()) {                     ta[i].join();                 }             }         } finally {             end = System.nanoTime();         }         System.out.println("Use:" + use + ":" + b + ":" + (end - start));     }     public static void addN(final long n, final int use) {         if (use == 0) {             for (long i = 0; i < n; i++) {                 addNBSync();             }         }         else if (use == 1)  {             for (long i = 0; i < n; i++) {                 addSync();             }         }         else {             for (long i = 0; i < n; i++) {                 addUnsafe();             }         }     }     public static void addNBSync() {         try {             lock.lock();             b++;         } finally {             lock.unlock();         }     }     public static synchronized void addSync() {         b++;     }     public static void addUnsafe() {         b++;     }     private static class MyRunnable implements Runnable {         private int use;         public MyRunnable(final int use) {             this.use = use;         }         public void run() {             addN(n, use);         }     } } [/sourcecode]

Welcome to WordPress.com. This is your first post. Edit or delete it and start blogging!