Frames | No Frames |
1: /* Copyright (C) 1999, 2000 Free Software Foundation 2: 3: This file is part of libgcj. 4: 5: This software is copyrighted work licensed under the terms of the 6: Libgcj License. Please consult the file "LIBGCJ_LICENSE" for 7: details. */ 8: 9: package gnu.gcj.convert; 10: 11: /** 12: * Convert UTF8 to Unicode. 13: * @author Per Bothner <bothner@cygnus.com> 14: * @date March 1999. 15: */ 16: 17: public class Input_UTF8 extends BytesToUnicode 18: { 19: public String getName() { return "UTF8"; } 20: 21: int partial = 0; 22: int partial_bytes_expected = 0; 23: //int suggogate_second = -1; 24: 25: public int read (char[] outbuffer, int outpos, int count) 26: { 27: int origpos = outpos; 28: for (;;) 29: { 30: if (outpos - origpos >= count) 31: break; 32: if (inpos >= inlength) 33: break; 34: int b = inbuffer[inpos++]; 35: if (b >= 0) 36: outbuffer[outpos++] = (char) b; 37: else 38: { 39: if ((b & 0xC0) == 0x80) // Continuation byte 40: { 41: partial = (partial << 6) | (b & 0x3F); 42: --partial_bytes_expected; 43: if (partial_bytes_expected == 1) 44: { 45: if (partial > (0xFFFF>>6)) 46: { 47: // The next continuation byte will cause the result 48: // to exceed 0xFFFF, so we must use a surrogate pair. 49: // The "Unicode scalar value" (see D28 in section 3.7 50: // of the Unicode Standard 2.0) is defined as: 51: // value == (hi-0xD800)*0x400+(lo-0xDC00)+0x10000, 52: // where (hi, lo) is the Unicode surrogate pair. 53: // After reading the first three bytes, we have: 54: // partial == (value >> 6). 55: // Substituting and simplifying, we get: 56: // partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400. 57: // The definition lo>=0xDC00 && lo<=0xDFFF implies 58: // that (lo-0xDC00)>>6 is in the range 0..15. 59: // Hence we can solve for `hi' and we can emit 60: // the high-surrogate without waiting for the 61: // final byte: 62: outbuffer[outpos++] 63: = (char) (0xD800 + ((partial - 0x400) >> 4)); 64: 65: // Now we want to set it up so that when we read 66: // the final byte on the next iteration, we will 67: // get the low-surrogate without special handling. 68: // I.e. we want: 69: // lo == (next_partial << 6) | (next & 0x3F) 70: // where next is the next input byte and next_partial 71: // is the value of partial at the end of this 72: // iteration. This implies: next_partial == lo >> 6. 73: // We can simplify the previous: 74: // partial == (hi-0xD800)*0x10+((lo-0xDC00)>>6)+0x400, 75: // to: partial == (hi-0xD800)*0x10+(lo>>6)+0x90. 76: // Inserting the values of hi and next_partial, 77: // and simplifying, we get: partial == 78: // ( (partial-0x400)&~0xF) + next_partial + 0x90. 79: // Solving for next_partial, we get: 80: // next_partial = partial+0x400-0x90-(partial&~0xF): 81: // or: next_partial = (partial&0xF) + 0x370. Hence: 82: partial = (partial & 0xF) + 0x370; 83: } 84: } 85: else if (partial_bytes_expected == 0) 86: { 87: outbuffer[outpos++] = (char) partial; 88: partial = 0; 89: partial_bytes_expected = 0; 90: } 91: } 92: else // prefix byte 93: { 94: if ((b & 0xE0) == 0xC0) 95: { 96: partial = b & 0x1F; 97: partial_bytes_expected = 1; 98: } 99: else if ((b & 0xF0) == 0xE0) 100: { 101: partial = b & 0xF; 102: partial_bytes_expected = 2; 103: } 104: else 105: { 106: partial = b & 7; 107: partial_bytes_expected = 3; 108: } 109: } 110: } 111: } 112: return outpos - origpos; 113: } 114: }