source: trunk/src/gcc/libjava/gnu/gcj/convert/Output_UTF8.java@ 2

Last change on this file since 2 was 2, checked in by bird, 22 years ago

Initial revision

  • Property cvs2svn:cvs-rev set to 1.1
  • Property svn:eol-style set to native
  • Property svn:executable set to *
File size: 3.2 KB
Line 
1/* Copyright (C) 1999, 2000 Free Software Foundation
2
3 This file is part of libgcj.
4
5This software is copyrighted work licensed under the terms of the
6Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
7details. */
8
9package gnu.gcj.convert;
10
11/**
12 * Convert Unicode to UTF8.
13 * @author Per Bothner <bothner@cygnus.com>
14 * @date March 1999.
15 */
16
17public class Output_UTF8 extends UnicodeToBytes
18{
19 public String getName() { return "UTF8"; }
20
21 /** True if a surrogate pair should be emitted as a single UTF8 sequence.
22 * Otherwise, a surrogate pair is treated as two separate characters.
23 * Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
24 public boolean standardUTF8 = true;
25
26 // Saves the previous char if it was a high-surrogate.
27 char hi_part;
28 // Value of incomplete character.
29 int value;
30 // Number of continuation bytes still to emit.
31 int bytes_todo;
32
33 public int write (char[] inbuffer, int inpos, int inlength)
34 {
35 int start_pos = inpos;
36 int avail = buf.length - count;
37 for (;;)
38 {
39 if (avail == 0 || (inlength == 0 && bytes_todo == 0))
40 break;
41 // The algorithm is made more complicated because we want to write
42 // at least one byte in the output buffer, if there is room for
43 // that byte, and at least one input character is available.
44 // This makes the code more robust, since client code will
45 // always "make progress", even in the complicated cases,
46 // where the output buffer only has room for only *part* of a
47 // multi-byte sequence, or the input char buffer only has half
48 // of a surrogate pair (when standardUTF8 is set), or both.
49
50 // Handle continuation characters we did not have room for before.
51 if (bytes_todo > 0)
52 {
53 do
54 {
55 bytes_todo--;
56 buf[count++] = (byte)
57 (((value >> (bytes_todo * 6)) & 0x3F) | 0x80);
58 avail--;
59 }
60 while (bytes_todo > 0 && avail > 0);
61 continue;
62 }
63
64 char ch = inbuffer[inpos++];
65 inlength--;
66
67 if ((hi_part != 0 && (ch <= 0xDBFF || ch > 0xDFFF))
68 || (hi_part == 0 && ch >= 0xDC00 && ch <= 0xDFFF))
69 {
70 // If the previous character was a high surrogate, and we
71 // don't now have a low surrogate, we print the high
72 // surrogate as an isolated character. If this character
73 // is a low surrogate and we didn't previously see a high
74 // surrogate, we do the same thing.
75 --inpos;
76 ++inlength;
77 buf[count++] = (byte) (0xE0 | (hi_part >> 12));
78 value = hi_part;
79 hi_part = 0;
80 avail--;
81 bytes_todo = 2;
82 }
83 else if (ch < 128 && (ch != 0 || standardUTF8))
84 {
85 avail--;
86 buf[count++] = (byte) ch;
87 }
88 else if (ch <= 0x07FF)
89 {
90 buf[count++] = (byte) (0xC0 | (ch >> 6));
91 avail--;
92 value = ch;
93 bytes_todo = 1;
94 }
95 else if (ch >= 0xD800 && ch <= 0xDFFF && standardUTF8)
96 {
97 if (ch <= 0xDBFF) // High surrogates
98 {
99 // Just save the high surrogate until the next
100 // character comes along.
101 hi_part = ch;
102 }
103 else // Low surrogates
104 {
105 value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
106 buf[count++] = (byte) (0xF0 | (value >> 18));
107 bytes_todo = 3;
108 hi_part = 0;
109 }
110 }
111 else
112 {
113 buf[count++] = (byte) (0xE0 | (ch >> 12));
114 value = ch;
115 avail--;
116 bytes_todo = 2;
117 }
118 }
119 return inpos - start_pos;
120 }
121}
Note: See TracBrowser for help on using the repository browser.