/*
    Reference Implementation of null and identity canonicalizations for
    the XML Signature work group.

    Author of original work: Ed Simon <ed.simon@entrust.com>

    *******************************************************************
    Last modified on 1999 September 22 by Ed Simon
*/

/*
    To compile, enter "javac Canonicalizer.java".
    To run the self-test, enter "java Canonicalizer".

    Note: Originally developed with JDK 1.2.2 on Windows.
*/

import java.io.*; // for UnsupportedEncodingException class

import java.lang.*;
import java.text.*;


/**
 * Reference implementation of the "null" and "identity" canonicalizations,
 * together with a small console self-test driven by {@link #main(String[])}.
 *
 * All members are static; the class holds no state.
 */
public class Canonicalizer {

    /** Lower-case hexadecimal digits indexed by nibble value; used by byteToHex(). */
    private static final char[] HEX_DIGITS = {
            '0', '1', '2', '3', '4', '5', '6', '7',
            '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
    };


    /**
     * Performs the null canonicalization, which does no canonicalization.
     *
     * @param inputBytes the bytes to canonicalize
     * @return the very same array that was passed in (not a copy)
     */
    public static byte[] doNull(byte[] inputBytes) {
        return inputBytes;
    }


    /**
     * Performs the identity canonicalization on a sequence of input bytes.
     *
     * Identity canonicalization involves
     *
     *     * converting from the input encoding to UTF-8
     *
     *     * normalizing the line breaks ("\r\n" and lone "\r") to UTF-8 0x0A
     *
     * Though one could write code to go directly from the non-UTF-8 input
     * encoding to the UTF-8 output encoding, for ease and brevity the bytes
     * are first decoded into a Java (Unicode) String, as described in
     * "http://java.sun.com/docs/books/tutorial/i18n/text/string.html",
     * the line breaks are normalized on that String, and the result is
     * then encoded as UTF-8.
     *
     * @param inputBytes    the bytes to canonicalize; must not be null
     * @param inputEncoding the Java name of the encoding of inputBytes;
     *                      null or "" selects the platform default encoding
     * @return the canonicalized UTF-8 bytes, or null if an encoding is not
     *         supported (a message and stack trace are printed in that case)
     */
    public static byte[] doIdentity(byte[] inputBytes, String inputEncoding) {

        String inputAsUnicodeString;

        if (inputEncoding == null || inputEncoding.equals("")) {
            // No encoding given: decode with the platform default encoding.
            inputAsUnicodeString = new String(inputBytes);
        } else {
            try {
                // The String() constructor that takes the encoding of the
                // input bytes yields a Java String, which is Unicode.
                inputAsUnicodeString = new String(inputBytes, inputEncoding);
            } catch (UnsupportedEncodingException exception) {
                System.out.println("Encoding " + inputEncoding + " not supported.");
                exception.printStackTrace();
                return null;
            }
        }

        // Rather than converting to bytes at this point, normalize the line
        // breaks while the text is still a String.
        String stringWithNormalizedLineBreaks = normalizeLineBreaks(inputAsUnicodeString);

        // Now that the line breaks have been normalized, we want the sequence
        // of output bytes representing a UTF-8 string, so we call getBytes()
        // specifying the output encoding.  (Java uses "UTF8" as the label for
        // "UTF-8", see
        // "http://java.sun.com/products/jdk/1.1/docs/guide/intl/encoding.doc.html"
        // for details; the encoding conversion applet at
        // "http://www.macchiato.com/mark/UnicodeConverter/" has a more
        // complete list.)
        try {
            return stringWithNormalizedLineBreaks.getBytes("UTF8");
        } catch (UnsupportedEncodingException exception) {
            System.out.println("Encoding not supported.");
            exception.printStackTrace();
            return null;
        }
    }


    /**
     * Returns a copy of text in which every "\r\n" pair and every lone "\r"
     * has been replaced by a single "\n".
     *
     * The text is walked by index rather than with a CharacterIterator:
     * CharacterIterator.DONE is the character U+FFFF, so an iterator-based
     * loop would stop early on input that contains that character.
     */
    private static String normalizeLineBreaks(String text) {
        int length = text.length();
        StringBuffer normalized = new StringBuffer(length);

        for (int index = 0; index < length; index++) {
            char character = text.charAt(index);

            if (character == '\r') {
                normalized.append('\n');

                // A '\n' directly after the '\r' belongs to the same line
                // break, so skip over it.
                if (index + 1 < length && text.charAt(index + 1) == '\n') {
                    index++;
                }
            } else {
                normalized.append(character);
            }
        }

        return normalized.toString();
    }


    /**
     * Provides an easy way of showing the results of calling the above
     * methods: runs each sample through test(), which prints the actual and
     * expected bytes and a PASSED/FAILED verdict.
     *
     * @param args ignored
     */
    public static void main(String args[]) {

        try {
            // Encode with the same charset that is then named to test().
            byte[] bytes_simpleString = "Hello World!".getBytes("8859_1");

            byte[] bytes_utf8Output0 = {
                    (byte) 0x48, (byte) 0x65, (byte) 0x6c, (byte) 0x6c,
                    (byte) 0x6f, (byte) 0x20, (byte) 0x57, (byte) 0x6f,
                    (byte) 0x72, (byte) 0x6c, (byte) 0x64, (byte) 0x21 };

            test(bytes_simpleString, "8859_1", bytes_utf8Output0);

            /*
                For more interesting test data, we use the examples in section 4 of
                RFC 2279: "UTF-8, a transformation format of ISO 10646"
                (see "http://info.internet.isi.edu/in-notes/rfc/files/rfc2279.txt").

                From section 4 of RFC 2279...

                    4.  Examples

                       The UCS-2 sequence "A<NOT IDENTICAL TO><ALPHA>." (0041, 2262, 0391,
                       002E) may be encoded in UTF-8 as follows:

                       41 E2 89 A2 CE 91 2E

                       The UCS-2 sequence representing the Hangul characters for the Korean
                       word "hangugo" (D55C, AD6D, C5B4) may be encoded as follows:

                       ED 95 9C EA B5 AD EC 96 B4

                       The UCS-2 sequence representing the Han characters for the Japanese
                       word "nihongo" (65E5, 672C, 8A9E) may be encoded as follows:

                       E6 97 A5 E6 9C AC E8 AA 9E

                The UCS-2 input bytes below are big-endian with no byte order mark,
                which Java decodes under the charset name "UTF-16BE".
            */

            byte[] bytes_AnotAlpha = {  (byte) 0x00, (byte) 0x41, (byte) 0x22, (byte) 0x62,
                                        (byte) 0x03, (byte) 0x91, (byte) 0x00, (byte) 0x2e };

            byte[] bytes_utf8Output1 = {    (byte) 0x41, (byte) 0xe2, (byte) 0x89, (byte) 0xa2,
                                            (byte) 0xce, (byte) 0x91, (byte) 0x2e };

            test(bytes_AnotAlpha, "UTF-16BE", bytes_utf8Output1);

            byte[] bytes_hangugo = {    (byte) 0xd5, (byte) 0x5c, (byte) 0xad, (byte) 0x6d,
                                        (byte) 0xc5, (byte) 0xb4 };

            byte[] bytes_utf8Output2 = {    (byte) 0xed, (byte) 0x95, (byte) 0x9c, (byte) 0xea,
                                            (byte) 0xb5, (byte) 0xad, (byte) 0xec, (byte) 0x96,
                                            (byte) 0xb4 };

            test(bytes_hangugo, "UTF-16BE", bytes_utf8Output2);

            byte[] bytes_nihongoUcs2 = {    (byte) 0x65, (byte) 0xe5, (byte) 0x67, (byte) 0x2c,
                                            (byte) 0x8a, (byte) 0x9e };

            byte[] bytes_utf8Output3 = {    (byte) 0xe6, (byte) 0x97, (byte) 0xa5, (byte) 0xe6,
                                            (byte) 0x9c, (byte) 0xac, (byte) 0xe8, (byte) 0xaa,
                                            (byte) 0x9e };

            test(bytes_nihongoUcs2, "UTF-16BE", bytes_utf8Output3);

            // Here's the bytes for the JIS encoding of the Japanese form of Nihongo;
            // the expected UTF-8 output is the same as for the UCS-2 form above.
            byte[] bytes_nihongoJis = { (byte) 0x1B, (byte) 0x24, (byte) 0x42, (byte) 0x46,
                                        (byte) 0x7C, (byte) 0x4B, (byte) 0x5C, (byte) 0x38,
                                        (byte) 0x6C, (byte) 0x1B, (byte) 0x28, (byte) 0x42,
                                        (byte) 0x1B, (byte) 0x28, (byte) 0x42 };

            test(bytes_nihongoJis, "JIS", bytes_utf8Output3);

            // The remaining samples contain "\r" and "\r\n" sequences to test
            // the line break canonicalization code.
            byte[] bytes_TextWithAMixOfNewLineEncodings0 =
                    "\r\nSee\r\n\n\r you \r\rlater!\n".getBytes("8859_1");

            byte[] bytes_utf8Output4 = {
                    (byte) 0x0a, (byte) 0x53, (byte) 0x65,
                    (byte) 0x65, (byte) 0x0a, (byte) 0x0a,
                    (byte) 0x0a, (byte) 0x20, (byte) 0x79,
                    (byte) 0x6f, (byte) 0x75, (byte) 0x20,
                    (byte) 0x0a, (byte) 0x0a, (byte) 0x6c,
                    (byte) 0x61, (byte) 0x74, (byte) 0x65,
                    (byte) 0x72, (byte) 0x21, (byte) 0x0a };

            test(bytes_TextWithAMixOfNewLineEncodings0, "8859_1", bytes_utf8Output4);

            // Same text, but starting and ending with a lone "\r".
            byte[] bytes_TextWithAMixOfNewLineEncodings1 =
                    "\rSee\r\n\n\r you \r\rlater!\r".getBytes("8859_1");

            byte[] bytes_utf8Output5 = {
                    (byte) 0x0a, (byte) 0x53, (byte) 0x65,
                    (byte) 0x65, (byte) 0x0a, (byte) 0x0a,
                    (byte) 0x0a, (byte) 0x20, (byte) 0x79,
                    (byte) 0x6f, (byte) 0x75, (byte) 0x20,
                    (byte) 0x0a, (byte) 0x0a, (byte) 0x6c,
                    (byte) 0x61, (byte) 0x74, (byte) 0x65,
                    (byte) 0x72, (byte) 0x21, (byte) 0x0a };

            test(bytes_TextWithAMixOfNewLineEncodings1, "8859_1", bytes_utf8Output5);

            // Same text, but with the last "\r" followed by an ordinary character.
            byte[] bytes_TextWithAMixOfNewLineEncodings2 =
                    "\rSee\r\n\n\r you \r\rlater\r!".getBytes("8859_1");

            byte[] bytes_utf8Output6 = {
                    (byte) 0x0a, (byte) 0x53, (byte) 0x65,
                    (byte) 0x65, (byte) 0x0a, (byte) 0x0a,
                    (byte) 0x0a, (byte) 0x20, (byte) 0x79,
                    (byte) 0x6f, (byte) 0x75, (byte) 0x20,
                    (byte) 0x0a, (byte) 0x0a, (byte) 0x6c,
                    (byte) 0x61, (byte) 0x74, (byte) 0x65,
                    (byte) 0x72, (byte) 0x0a, (byte) 0x21 };

            test(bytes_TextWithAMixOfNewLineEncodings2, "8859_1", bytes_utf8Output6);

        } catch (UnsupportedEncodingException exception) {
            System.out.println("Encoding not supported.");
            exception.printStackTrace();
        }
    }


    /**
     * Runs doNull() and doIdentity() on one sample and prints the results,
     * the expected output and a PASSED/FAILED verdict to System.out.
     *
     * @param testBytes          the sample input bytes
     * @param testBytesEncoding  the encoding name passed to doIdentity()
     * @param expectedUTF8Output the bytes doIdentity() is expected to return
     */
    public static void test(byte[] testBytes, String testBytesEncoding, byte[] expectedUTF8Output) {

        // For display only: decoded with the platform default encoding.
        String testString = new String(testBytes);
        System.out.println("\n\ntestString: \"" + testString + "\"");

        byte[] bytesFromDoNull = doNull(testBytes);

        System.out.println("\nBytes from doNull():");
        printBytes(bytesFromDoNull);

        String bytesFromDoNullString = new String(bytesFromDoNull);
        System.out.println("\ndoNull() result as a UCS-2 string: \""
                           + bytesFromDoNullString + "\"");

        byte[] bytesFromDoIdentity = doIdentity(testBytes, testBytesEncoding);

        // doIdentity() returns null when an encoding is not supported; report
        // that instead of failing with a NullPointerException below.
        if (bytesFromDoIdentity == null) {
            System.out.println("\ndoIdentity() returned null for encoding \""
                               + testBytesEncoding + "\".");
            System.out.println("\nTest FAILED.");
            return;
        }

        System.out.println("\nBytes from doIdentity():");
        printBytes(bytesFromDoIdentity);

        System.out.println("\nExpected bytes from doIdentity():");
        printBytes(expectedUTF8Output);

        try {
            String bytesFromDoIdentityString = new String(bytesFromDoIdentity, "UTF8");
            System.out.println("\ndoIdentity() result as a UTF-8 string: \""
                               + bytesFromDoIdentityString + "\"");
        } catch (UnsupportedEncodingException exception) {
            System.out.println("Encoding not supported.");
            exception.printStackTrace();
        }

        boolean matches = java.util.Arrays.equals(bytesFromDoIdentity, expectedUTF8Output);
        System.out.println("\nTest " + (matches ? "PASSED" : "FAILED") + ".");
    }


    /**
     * Prints each byte of array as two hex digits followed by a space, then
     * ends the line.  Adapted from
     * "http://java.sun.com/docs/books/tutorial/i18n/text/string.html".
     *
     * @param array the bytes to print; must not be null
     */
    public static void printBytes(byte[] array) {
        for (int k = 0; k < array.length; k++) {
            System.out.print(byteToHex(array[k]) + " ");
        }

        System.out.println();
    }


    /**
     * Returns the two-digit, lower-case hex String representation of byte b.
     * Adapted from
     * "http://java.sun.com/docs/books/tutorial/i18n/text/example-1dot1/UnicodeFormatter.java".
     *
     * @param b the byte to format
     * @return a String of exactly two hex digits, high nibble first
     */
    public static String byteToHex(byte b) {
        // Masking after the shift discards the sign extension of negative bytes.
        char[] array = { HEX_DIGITS[(b >> 4) & 0x0f], HEX_DIGITS[b & 0x0f] };

        return new String(array);
    }


}