1 | /* Copyright (C) 1999, 2000 Free Software Foundation
|
---|
2 |
|
---|
3 | This file is part of libgcj.
|
---|
4 |
|
---|
5 | This software is copyrighted work licensed under the terms of the
|
---|
6 | Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
|
---|
7 | details. */
|
---|
8 |
|
---|
9 | package gnu.gcj.convert;
|
---|
10 |
|
---|
11 | /**
|
---|
12 | * Convert Unicode to UTF8.
|
---|
13 | * @author Per Bothner <bothner@cygnus.com>
|
---|
14 | * @date Match 1999.
|
---|
15 | */
|
---|
16 |
|
---|
17 | public class Output_UTF8 extends UnicodeToBytes
|
---|
18 | {
|
---|
19 | public String getName() { return "UTF8"; }
|
---|
20 |
|
---|
21 | /** True if a surrogate pair should be emitted as a single UTF8 sequence.
|
---|
22 | * Otherwise, a surrogate pair is treated as two separate characters.
|
---|
23 | * Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
|
---|
24 | public boolean standardUTF8 = true;
|
---|
25 |
|
---|
26 | // Saves the previous char if it was a high-surrogate.
|
---|
27 | char hi_part;
|
---|
28 | // Value of incomplete character.
|
---|
29 | int value;
|
---|
30 | // Number of continuation bytes still to emit.
|
---|
31 | int bytes_todo;
|
---|
32 |
|
---|
33 | public int write (char[] inbuffer, int inpos, int inlength)
|
---|
34 | {
|
---|
35 | int start_pos = inpos;
|
---|
36 | int avail = buf.length - count;
|
---|
37 | for (;;)
|
---|
38 | {
|
---|
39 | if (avail == 0 || (inlength == 0 && bytes_todo == 0))
|
---|
40 | break;
|
---|
41 | // The algorithm is made more complicated because we want to write
|
---|
42 | // at least one byte in the output buffer, if there is room for
|
---|
43 | // that byte, and at least one input character is available.
|
---|
44 | // This makes the code more robust, since client code will
|
---|
45 | // always "make progress", even in the complicated cases,
|
---|
46 | // where the output buffer only has room for only *part* of a
|
---|
47 | // multi-byte sequence, or the input char buffer only has half
|
---|
48 | // of a surrogate pair (when standardUTF8 is set), or both.
|
---|
49 |
|
---|
50 | // Handle continuation characters we did not have room for before.
|
---|
51 | if (bytes_todo > 0)
|
---|
52 | {
|
---|
53 | do
|
---|
54 | {
|
---|
55 | bytes_todo--;
|
---|
56 | buf[count++] = (byte)
|
---|
57 | (((value >> (bytes_todo * 6)) & 0x3F) | 0x80);
|
---|
58 | avail--;
|
---|
59 | }
|
---|
60 | while (bytes_todo > 0 && avail > 0);
|
---|
61 | continue;
|
---|
62 | }
|
---|
63 |
|
---|
64 | char ch = inbuffer[inpos++];
|
---|
65 | inlength--;
|
---|
66 |
|
---|
67 | if ((hi_part != 0 && (ch <= 0xDBFF || ch > 0xDFFF))
|
---|
68 | || (hi_part == 0 && ch >= 0xDC00 && ch <= 0xDFFF))
|
---|
69 | {
|
---|
70 | // If the previous character was a high surrogate, and we
|
---|
71 | // don't now have a low surrogate, we print the high
|
---|
72 | // surrogate as an isolated character. If this character
|
---|
73 | // is a low surrogate and we didn't previously see a high
|
---|
74 | // surrogate, we do the same thing.
|
---|
75 | --inpos;
|
---|
76 | ++inlength;
|
---|
77 | buf[count++] = (byte) (0xE0 | (hi_part >> 12));
|
---|
78 | value = hi_part;
|
---|
79 | hi_part = 0;
|
---|
80 | avail--;
|
---|
81 | bytes_todo = 2;
|
---|
82 | }
|
---|
83 | else if (ch < 128 && (ch != 0 || standardUTF8))
|
---|
84 | {
|
---|
85 | avail--;
|
---|
86 | buf[count++] = (byte) ch;
|
---|
87 | }
|
---|
88 | else if (ch <= 0x07FF)
|
---|
89 | {
|
---|
90 | buf[count++] = (byte) (0xC0 | (ch >> 6));
|
---|
91 | avail--;
|
---|
92 | value = ch;
|
---|
93 | bytes_todo = 1;
|
---|
94 | }
|
---|
95 | else if (ch >= 0xD800 && ch <= 0xDFFF && standardUTF8)
|
---|
96 | {
|
---|
97 | if (ch <= 0xDBFF) // High surrogates
|
---|
98 | {
|
---|
99 | // Just save the high surrogate until the next
|
---|
100 | // character comes along.
|
---|
101 | hi_part = ch;
|
---|
102 | }
|
---|
103 | else // Low surrogates
|
---|
104 | {
|
---|
105 | value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
|
---|
106 | buf[count++] = (byte) (0xF0 | (value >> 18));
|
---|
107 | bytes_todo = 3;
|
---|
108 | hi_part = 0;
|
---|
109 | }
|
---|
110 | }
|
---|
111 | else
|
---|
112 | {
|
---|
113 | buf[count++] = (byte) (0xE0 | (ch >> 12));
|
---|
114 | value = ch;
|
---|
115 | avail--;
|
---|
116 | bytes_todo = 2;
|
---|
117 | }
|
---|
118 | }
|
---|
119 | return inpos - start_pos;
|
---|
120 | }
|
---|
121 | }
|
---|