Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/src/gcc/libjava/gnu/gcj/convert/Output_UTF8.java@ 2

Visit:

Last change on this file since 2 was 2, checked in by bird, 22 years ago
Initial revision
Property cvs2svn:cvs-rev set to `1.1` Property svn:eol-style set to `native` Property svn:executable set to ``*
File size: 3.2 KB

Line
1	/* Copyright (C) 1999, 2000 Free Software Foundation
2
3	This file is part of libgcj.
4
5	This software is copyrighted work licensed under the terms of the
6	Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
7	details. */
8
9	package gnu.gcj.convert;
10
11	/**
12	* Convert Unicode to UTF8.
13	* @author Per Bothner <bothner@cygnus.com>
14	* @date March 1999.
15	*/
16
17	public class Output_UTF8 extends UnicodeToBytes
18	{
19	public String getName() { return "UTF8"; }
20
21	/** True if a surrogate pair should be emitted as a single UTF8 sequence.
22	* Otherwise, a surrogate pair is treated as two separate characters.
23	* Also, '\0' is emitted as {0} if true, and as {0xC0,0x80} if false. */
24	public boolean standardUTF8 = true;
25
26	// Saves the previous char if it was a high-surrogate.
27	char hi_part;
28	// Value of incomplete character.
29	int value;
30	// Number of continuation bytes still to emit.
31	int bytes_todo;
32
33	public int write (char[] inbuffer, int inpos, int inlength)
34	{
35	int start_pos = inpos;
36	int avail = buf.length - count;
37	for (;;)
38	{
39	if (avail == 0 \|\| (inlength == 0 && bytes_todo == 0))
40	break;
41	// The algorithm is made more complicated because we want to write
42	// at least one byte in the output buffer, if there is room for
43	// that byte, and at least one input character is available.
44	// This makes the code more robust, since client code will
45	// always "make progress", even in the complicated cases,
46	// where the output buffer only has room for only part of a
47	// multi-byte sequence, or the input char buffer only has half
48	// of a surrogate pair (when standardUTF8 is set), or both.
49
50	// Handle continuation characters we did not have room for before.
51	if (bytes_todo > 0)
52	{
53	do
54	{
55	bytes_todo--;
56	buf[count++] = (byte)
57	(((value >> (bytes_todo * 6)) & 0x3F) \| 0x80);
58	avail--;
59	}
60	while (bytes_todo > 0 && avail > 0);
61	continue;
62	}
63
64	char ch = inbuffer[inpos++];
65	inlength--;
66
67	if ((hi_part != 0 && (ch <= 0xDBFF \|\| ch > 0xDFFF))
68	\|\| (hi_part == 0 && ch >= 0xDC00 && ch <= 0xDFFF))
69	{
70	// If the previous character was a high surrogate, and we
71	// don't now have a low surrogate, we print the high
72	// surrogate as an isolated character. If this character
73	// is a low surrogate and we didn't previously see a high
74	// surrogate, we do the same thing.
75	--inpos;
76	++inlength;
77	buf[count++] = (byte) (0xE0 \| (hi_part >> 12));
78	value = hi_part;
79	hi_part = 0;
80	avail--;
81	bytes_todo = 2;
82	}
83	else if (ch < 128 && (ch != 0 \|\| standardUTF8))
84	{
85	avail--;
86	buf[count++] = (byte) ch;
87	}
88	else if (ch <= 0x07FF)
89	{
90	buf[count++] = (byte) (0xC0 \| (ch >> 6));
91	avail--;
92	value = ch;
93	bytes_todo = 1;
94	}
95	else if (ch >= 0xD800 && ch <= 0xDFFF && standardUTF8)
96	{
97	if (ch <= 0xDBFF) // High surrogates
98	{
99	// Just save the high surrogate until the next
100	// character comes along.
101	hi_part = ch;
102	}
103	else // Low surrogates
104	{
105	value = (hi_part - 0xD800) * 0x400 + (ch - 0xDC00) + 0x10000;
106	buf[count++] = (byte) (0xF0 \| (value >> 18));
107	bytes_todo = 3;
108	hi_part = 0;
109	}
110	}
111	else
112	{
113	buf[count++] = (byte) (0xE0 \| (ch >> 12));
114	value = ch;
115	avail--;
116	bytes_todo = 2;
117	}
118	}
119	return inpos - start_pos;
120	}
121	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: