1 | // RuleBasedCollator.java - Concrete class for locale-based string compare.
|
---|
2 |
|
---|
3 | /* Copyright (C) 1999, 2000, 2001 Free Software Foundation
|
---|
4 |
|
---|
5 | This file is part of libgcj.
|
---|
6 |
|
---|
7 | This software is copyrighted work licensed under the terms of the
|
---|
8 | Libgcj License. Please consult the file "LIBGCJ_LICENSE" for
|
---|
9 | details. */
|
---|
10 |
|
---|
11 | package java.text;
|
---|
12 |
|
---|
13 | import java.util.Enumeration;
|
---|
14 | import java.util.Hashtable;
|
---|
15 | import java.util.Vector;
|
---|
16 |
|
---|
17 | /**
|
---|
18 | * @author Tom Tromey <tromey@cygnus.com>
|
---|
19 | * @date March 25, 1999
|
---|
20 | */
|
---|
21 | /* Written using "Java Class Libraries", 2nd edition, plus online
|
---|
22 | * API docs for JDK 1.2 from http://www.javasoft.com.
|
---|
23 | * Status: Believed complete and correct
|
---|
24 | */
|
---|
25 |
|
---|
26 | final class RBCElement
|
---|
27 | {
|
---|
28 | String key;
|
---|
29 | char relation;
|
---|
30 |
|
---|
31 | RBCElement (String key, char relation)
|
---|
32 | {
|
---|
33 | this.key = key;
|
---|
34 | this.relation = relation;
|
---|
35 | }
|
---|
36 | }
|
---|
37 |
|
---|
38 | public class RuleBasedCollator extends Collator
|
---|
39 | {
|
---|
40 | public Object clone ()
|
---|
41 | {
|
---|
42 | RuleBasedCollator c = (RuleBasedCollator) super.clone ();
|
---|
43 | c.map = (Hashtable) map.clone ();
|
---|
44 | c.prefixes = (Hashtable) map.clone ();
|
---|
45 | return c;
|
---|
46 | }
|
---|
47 |
|
---|
48 | // A helper for CollationElementIterator.next().
|
---|
49 | int ceiNext (CollationElementIterator cei)
|
---|
50 | {
|
---|
51 | if (cei.lookahead_set)
|
---|
52 | {
|
---|
53 | cei.lookahead_set = false;
|
---|
54 | return cei.lookahead;
|
---|
55 | }
|
---|
56 |
|
---|
57 | int save = cei.index;
|
---|
58 | int max = cei.text.length();
|
---|
59 | String s = null;
|
---|
60 |
|
---|
61 | // It is possible to have a case where `abc' has a mapping, but
|
---|
62 | // neither `ab' nor `abd' do. In this case we must treat `abd' as
|
---|
63 | // nothing special.
|
---|
64 | boolean found = false;
|
---|
65 |
|
---|
66 | int i;
|
---|
67 | for (i = save + 1; i <= max; ++i)
|
---|
68 | {
|
---|
69 | s = cei.text.substring(save, i);
|
---|
70 | if (prefixes.get(s) == null)
|
---|
71 | break;
|
---|
72 | found = true;
|
---|
73 | }
|
---|
74 | // Assume s != null.
|
---|
75 |
|
---|
76 | Object obj = map.get(s);
|
---|
77 | // The special case.
|
---|
78 | while (found && obj == null && s.length() > 1)
|
---|
79 | {
|
---|
80 | --i;
|
---|
81 | s = cei.text.substring(save, i);
|
---|
82 | obj = map.get(s);
|
---|
83 | }
|
---|
84 |
|
---|
85 | // Update state.
|
---|
86 | cei.index = i;
|
---|
87 |
|
---|
88 | if (obj == null)
|
---|
89 | {
|
---|
90 | // This idea, and the values, come from JDK.
|
---|
91 | // assert (s.length() == 1)
|
---|
92 | cei.lookahead_set = true;
|
---|
93 | cei.lookahead = s.charAt(0) << 8;
|
---|
94 | return 0x7fff << 16;
|
---|
95 | }
|
---|
96 |
|
---|
97 | return ((Integer) obj).intValue();
|
---|
98 | }
|
---|
99 |
|
---|
100 | // A helper for compareTo() that returns the next character that has
|
---|
101 | // a nonzero ordering at the indicated strength. This is also used
|
---|
102 | // in CollationKey.
|
---|
103 | static final int next (CollationElementIterator iter, int strength)
|
---|
104 | {
|
---|
105 | while (true)
|
---|
106 | {
|
---|
107 | int os = iter.next();
|
---|
108 | if (os == CollationElementIterator.NULLORDER)
|
---|
109 | return os;
|
---|
110 | int c = 0;
|
---|
111 | switch (strength)
|
---|
112 | {
|
---|
113 | case PRIMARY:
|
---|
114 | c = os & ~0xffff;
|
---|
115 | break;
|
---|
116 | case SECONDARY:
|
---|
117 | c = os & ~0x00ff;
|
---|
118 | break;
|
---|
119 | case TERTIARY:
|
---|
120 | case IDENTICAL:
|
---|
121 | c = os;
|
---|
122 | break;
|
---|
123 | }
|
---|
124 | if (c != 0)
|
---|
125 | return c;
|
---|
126 | }
|
---|
127 | }
|
---|
128 |
|
---|
129 | public int compare (String source, String target)
|
---|
130 | {
|
---|
131 | CollationElementIterator cs, ct;
|
---|
132 |
|
---|
133 | cs = new CollationElementIterator (source, this);
|
---|
134 | ct = new CollationElementIterator (target, this);
|
---|
135 |
|
---|
136 | while (true)
|
---|
137 | {
|
---|
138 | int os = next (cs, strength);
|
---|
139 | int ot = next (ct, strength);
|
---|
140 |
|
---|
141 | if (os == CollationElementIterator.NULLORDER
|
---|
142 | && ot == CollationElementIterator.NULLORDER)
|
---|
143 | break;
|
---|
144 | else if (os == CollationElementIterator.NULLORDER)
|
---|
145 | {
|
---|
146 | // Source string is shorter, so return "less than".
|
---|
147 | return -1;
|
---|
148 | }
|
---|
149 | else if (ot == CollationElementIterator.NULLORDER)
|
---|
150 | {
|
---|
151 | // Target string is shorter, so return "greater than".
|
---|
152 | return 1;
|
---|
153 | }
|
---|
154 |
|
---|
155 | if (os != ot)
|
---|
156 | return os - ot;
|
---|
157 | }
|
---|
158 |
|
---|
159 | return 0;
|
---|
160 | }
|
---|
161 |
|
---|
162 | public boolean equals (Object obj)
|
---|
163 | {
|
---|
164 | if (! (obj instanceof RuleBasedCollator) || ! super.equals(obj))
|
---|
165 | return false;
|
---|
166 | RuleBasedCollator rbc = (RuleBasedCollator) obj;
|
---|
167 | // FIXME: this is probably wrong. Instead we should compare maps
|
---|
168 | // directly.
|
---|
169 | return (frenchAccents == rbc.frenchAccents
|
---|
170 | && rules.equals(rbc.rules));
|
---|
171 | }
|
---|
172 |
|
---|
173 | public CollationElementIterator getCollationElementIterator (String source)
|
---|
174 | {
|
---|
175 | StringBuffer expand = new StringBuffer (source.length());
|
---|
176 | int max = source.length();
|
---|
177 | for (int i = 0; i < max; ++i)
|
---|
178 | decomposeCharacter (source.charAt(i), expand);
|
---|
179 | return new CollationElementIterator (expand.toString(), this);
|
---|
180 | }
|
---|
181 |
|
---|
182 | public CollationElementIterator getCollationElementIterator (CharacterIterator source)
|
---|
183 | {
|
---|
184 | StringBuffer expand = new StringBuffer ();
|
---|
185 | for (char c = source.first ();
|
---|
186 | c != CharacterIterator.DONE;
|
---|
187 | c = source.next ())
|
---|
188 | decomposeCharacter (c, expand);
|
---|
189 |
|
---|
190 | return new CollationElementIterator (expand.toString(), this);
|
---|
191 | }
|
---|
192 |
|
---|
193 | public CollationKey getCollationKey (String source)
|
---|
194 | {
|
---|
195 | return new CollationKey (getCollationElementIterator (source), source,
|
---|
196 | strength);
|
---|
197 | }
|
---|
198 |
|
---|
199 | public String getRules ()
|
---|
200 | {
|
---|
201 | return rules;
|
---|
202 | }
|
---|
203 |
|
---|
204 | public int hashCode ()
|
---|
205 | {
|
---|
206 | return (frenchAccents ? 1231 : 1237
|
---|
207 | ^ rules.hashCode()
|
---|
208 | ^ map.hashCode()
|
---|
209 | ^ prefixes.hashCode());
|
---|
210 | }
|
---|
211 |
|
---|
212 | private final boolean is_special (char c)
|
---|
213 | {
|
---|
214 | // Rules from JCL book.
|
---|
215 | return ((c >= 0x0009 && c <= 0x000d)
|
---|
216 | || (c >= 0x0020 && c <= 0x002f)
|
---|
217 | || (c >= 0x003a && c <= 0x0040)
|
---|
218 | || (c >= 0x005b && c <= 0x0060)
|
---|
219 | || (c >= 0x007b && c <= 0x007e));
|
---|
220 | }
|
---|
221 |
|
---|
222 | private final int text_argument (String rules, int index,
|
---|
223 | StringBuffer result)
|
---|
224 | {
|
---|
225 | result.setLength(0);
|
---|
226 | int len = rules.length();
|
---|
227 | while (index < len)
|
---|
228 | {
|
---|
229 | char c = rules.charAt(index);
|
---|
230 | if (c == '\'' && index + 2 < len
|
---|
231 | && rules.charAt(index + 2) == '\''
|
---|
232 | && is_special (rules.charAt(index + 1)))
|
---|
233 | index += 2;
|
---|
234 | else if (is_special (c) || Character.isWhitespace(c))
|
---|
235 | return index;
|
---|
236 | result.append(c);
|
---|
237 | ++index;
|
---|
238 | }
|
---|
239 | return index;
|
---|
240 | }
|
---|
241 |
|
---|
242 | public RuleBasedCollator (String rules) throws ParseException
|
---|
243 | {
|
---|
244 | this.rules = rules;
|
---|
245 | this.frenchAccents = false;
|
---|
246 |
|
---|
247 | // We keep each rule in order in a vector. At the end we traverse
|
---|
248 | // the vector and compute collation values from it.
|
---|
249 | int insertion_index = 0;
|
---|
250 | Vector vec = new Vector ();
|
---|
251 |
|
---|
252 | StringBuffer argument = new StringBuffer ();
|
---|
253 |
|
---|
254 | int len = rules.length();
|
---|
255 | for (int index = 0; index < len; ++index)
|
---|
256 | {
|
---|
257 | char c = rules.charAt(index);
|
---|
258 |
|
---|
259 | // Just skip whitespace.
|
---|
260 | if (Character.isWhitespace(c))
|
---|
261 | continue;
|
---|
262 |
|
---|
263 | // Modifier.
|
---|
264 | if (c == '@')
|
---|
265 | {
|
---|
266 | frenchAccents = true;
|
---|
267 | continue;
|
---|
268 | }
|
---|
269 |
|
---|
270 | // Check for relation or reset operator.
|
---|
271 | if (! (c == '<' || c == ';' || c == ',' || c == '=' || c == '&'))
|
---|
272 | throw new ParseException ("invalid character", index);
|
---|
273 |
|
---|
274 | ++index;
|
---|
275 | while (index < len)
|
---|
276 | {
|
---|
277 | if (! Character.isWhitespace(rules.charAt(index)))
|
---|
278 | break;
|
---|
279 | ++index;
|
---|
280 | }
|
---|
281 | if (index == len)
|
---|
282 | throw new ParseException ("missing argument", index);
|
---|
283 |
|
---|
284 | int save = index;
|
---|
285 | index = text_argument (rules, index, argument);
|
---|
286 | if (argument.length() == 0)
|
---|
287 | throw new ParseException ("invalid character", save);
|
---|
288 | String arg = argument.toString();
|
---|
289 | int item_index = vec.indexOf(arg);
|
---|
290 | if (c != '&')
|
---|
291 | {
|
---|
292 | // If the argument already appears in the vector, then we
|
---|
293 | // must remove it in order to re-order.
|
---|
294 | if (item_index != -1)
|
---|
295 | {
|
---|
296 | vec.removeElementAt(item_index);
|
---|
297 | if (insertion_index >= item_index)
|
---|
298 | --insertion_index;
|
---|
299 | }
|
---|
300 | RBCElement r = new RBCElement (arg, c);
|
---|
301 | vec.insertElementAt(r, insertion_index);
|
---|
302 | ++insertion_index;
|
---|
303 | }
|
---|
304 | else
|
---|
305 | {
|
---|
306 | // Reset.
|
---|
307 | if (item_index == -1)
|
---|
308 | throw
|
---|
309 | new ParseException ("argument to reset not previously seen",
|
---|
310 | save);
|
---|
311 | insertion_index = item_index + 1;
|
---|
312 | }
|
---|
313 |
|
---|
314 | // Ugly: in this case the resulting INDEX comes from
|
---|
315 | // text_argument, which returns the index of the next
|
---|
316 | // character we should examine.
|
---|
317 | --index;
|
---|
318 | }
|
---|
319 |
|
---|
320 | // Now construct a hash table that maps strings onto their
|
---|
321 | // collation values.
|
---|
322 | int primary = 0;
|
---|
323 | int secondary = 0;
|
---|
324 | int tertiary = 0;
|
---|
325 | this.map = new Hashtable ();
|
---|
326 | this.prefixes = new Hashtable ();
|
---|
327 | Enumeration e = vec.elements();
|
---|
328 | while (e.hasMoreElements())
|
---|
329 | {
|
---|
330 | RBCElement r = (RBCElement) e.nextElement();
|
---|
331 | switch (r.relation)
|
---|
332 | {
|
---|
333 | case '<':
|
---|
334 | ++primary;
|
---|
335 | secondary = 0;
|
---|
336 | tertiary = 0;
|
---|
337 | break;
|
---|
338 | case ';':
|
---|
339 | ++secondary;
|
---|
340 | tertiary = 0;
|
---|
341 | break;
|
---|
342 | case ',':
|
---|
343 | ++tertiary;
|
---|
344 | break;
|
---|
345 | case '=':
|
---|
346 | break;
|
---|
347 | }
|
---|
348 | // This must match CollationElementIterator.
|
---|
349 | map.put(r.key, new Integer (primary << 16
|
---|
350 | | secondary << 8 | tertiary));
|
---|
351 |
|
---|
352 | // Make a map of all lookaheads we might need.
|
---|
353 | for (int i = r.key.length() - 1; i >= 1; --i)
|
---|
354 | prefixes.put(r.key.substring(0, i), Boolean.TRUE);
|
---|
355 | }
|
---|
356 | }
|
---|
357 |
|
---|
358 | // True if we are using French-style accent ordering.
|
---|
359 | private boolean frenchAccents;
|
---|
360 |
|
---|
361 | // It's easier to just save the rules than to try to recreate them.
|
---|
362 | private String rules;
|
---|
363 |
|
---|
364 | // This maps strings onto collation values.
|
---|
365 | private Hashtable map;
|
---|
366 | // An entry in this hash means that more lookahead is required for
|
---|
367 | // the prefix string.
|
---|
368 | private Hashtable prefixes;
|
---|
369 | }
|
---|