001    // Copyright 2006-2007 Regents of the University of California.  May be used 
002    // under the terms of the revised BSD license.  See LICENSING for details.
003    package org.joe_e.charset;
004    
005    import java.nio.charset.Charset;
006    import java.nio.ByteBuffer;
007    import java.io.InputStream;
008    import java.io.InputStreamReader;
009    import java.io.OutputStream;
010    import java.io.OutputStreamWriter;
011    import java.io.Reader;
012    import java.io.Writer;
013    
/**
 * UTF-8 I/O: static helpers for converting between strings and UTF-8 bytes
 * and for wrapping binary streams in UTF-8 readers and writers.
 * This class cannot be instantiated.
 */
public final class UTF8 {
    /** The UTF-8 charset shared by every helper in this class. */
    private static final Charset charset = Charset.forName("UTF-8");

    private UTF8() {}

    /**
     * Encodes a string in UTF-8.
     * @param text  The text to encode.
     * @return The UTF-8 bytes, in an array exactly as long as the encoding.
     */
    public static byte[] encode(final String text) {
        final ByteBuffer bytes = charset.encode(text);
        // Fast path: hand back the backing array without copying, but only
        // when it is accessible and the encoded content spans all of it.
        if (bytes.hasArray()) {
            final byte[] backing = bytes.array();
            if (bytes.arrayOffset() == 0
                    && bytes.position() == 0
                    && bytes.remaining() == backing.length) {
                return backing;
            }
        }
        // Otherwise copy just the encoded bytes; a relative bulk get honours
        // both the buffer's position and its array offset.
        final byte[] r = new byte[bytes.remaining()];
        bytes.get(r);
        return r;
    }

    /**
     * Decodes a UTF-8 string. Each byte not corresponding to a UTF-8
     * character decodes to the Unicode replacement character U+FFFD.
     * Note that an initial byte-order mark is not stripped.  This method is
     * equivalent to <code>decode(buffer, 0, buffer.length)</code>.
     * @param buffer    the UTF-8 encoded bytes to decode
     * @return The corresponding string
     */
    public static String decode(byte[] buffer) {
        return decode(buffer, 0, buffer.length);
    }

    /**
     * Decodes a UTF-8 string. Each byte not corresponding to a UTF-8
     * character decodes to the Unicode replacement character U+FFFD.
     * Note that an initial byte-order mark is not stripped.
     * @param buffer    the array holding the UTF-8 encoded bytes to decode
     * @param off       where to start decoding
     * @param len       how many bytes to decode
     * @return The corresponding string
     * @throws java.lang.IndexOutOfBoundsException
     *         if <code>off</code> and <code>len</code> do not describe a
     *         range inside <code>buffer</code>
     */
    public static String decode(byte[] buffer, int off, int len) {
        return charset.decode(ByteBuffer.wrap(buffer, off, len)).toString();
    }

    /**
     * Constructs a UTF-8 reader.
     * @param in    The binary input stream.
     * @return A reader that decodes the bytes of <code>in</code> as UTF-8.
     */
    public static Reader input(final InputStream in) {
        return new InputStreamReader(in, charset);
    }

    /**
     * Constructs a UTF-8 writer.
     * @param out   The binary output stream.
     * @return A writer that encodes characters as UTF-8 onto <code>out</code>.
     */
    public static Writer output(final OutputStream out) {
        return new OutputStreamWriter(out, charset);
    }
}