source: coreutils/trunk/lib/regex_internal.c@ 1648

Last change on this file since 1648 was 1648, checked in by Silvan Scherrer, 9 years ago

coreutils: update trunk to version 8.25

File size: 47.6 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002-2016 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU General Public
8 License as published by the Free Software Foundation; either
9 version 3 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 General Public License for more details.
15
16 You should have received a copy of the GNU General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20#include "verify.h"
21#include "intprops.h"
22static void re_string_construct_common (const char *str, Idx len,
23 re_string_t *pstr,
24 RE_TRANSLATE_TYPE trans, bool icase,
25 const re_dfa_t *dfa) internal_function;
26static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
27 const re_node_set *nodes,
28 re_hashval_t hash) internal_function;
29static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
30 const re_node_set *nodes,
31 unsigned int context,
32 re_hashval_t hash) internal_function;
33
34
35/* Functions for string operation. */
36
37/* This function allocate the buffers. It is necessary to call
38 re_string_reconstruct before using the object. */
39
40static reg_errcode_t
41internal_function __attribute_warn_unused_result__
42re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
43 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
44{
45 reg_errcode_t ret;
46 Idx init_buf_len;
47
48 /* Ensure at least one character fits into the buffers. */
49 if (init_len < dfa->mb_cur_max)
50 init_len = dfa->mb_cur_max;
51 init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
52 re_string_construct_common (str, len, pstr, trans, icase, dfa);
53
54 ret = re_string_realloc_buffers (pstr, init_buf_len);
55 if (BE (ret != REG_NOERROR, 0))
56 return ret;
57
58 pstr->word_char = dfa->word_char;
59 pstr->word_ops_used = dfa->word_ops_used;
60 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
61 pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
62 pstr->valid_raw_len = pstr->valid_len;
63 return REG_NOERROR;
64}
65
66/* This function allocate the buffers, and initialize them. */
67
68static reg_errcode_t
69internal_function __attribute_warn_unused_result__
70re_string_construct (re_string_t *pstr, const char *str, Idx len,
71 RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
72{
73 reg_errcode_t ret;
74 memset (pstr, '\0', sizeof (re_string_t));
75 re_string_construct_common (str, len, pstr, trans, icase, dfa);
76
77 if (len > 0)
78 {
79 ret = re_string_realloc_buffers (pstr, len + 1);
80 if (BE (ret != REG_NOERROR, 0))
81 return ret;
82 }
83 pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
84
85 if (icase)
86 {
87#ifdef RE_ENABLE_I18N
88 if (dfa->mb_cur_max > 1)
89 {
90 while (1)
91 {
92 ret = build_wcs_upper_buffer (pstr);
93 if (BE (ret != REG_NOERROR, 0))
94 return ret;
95 if (pstr->valid_raw_len >= len)
96 break;
97 if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
98 break;
99 ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
100 if (BE (ret != REG_NOERROR, 0))
101 return ret;
102 }
103 }
104 else
105#endif /* RE_ENABLE_I18N */
106 build_upper_buffer (pstr);
107 }
108 else
109 {
110#ifdef RE_ENABLE_I18N
111 if (dfa->mb_cur_max > 1)
112 build_wcs_buffer (pstr);
113 else
114#endif /* RE_ENABLE_I18N */
115 {
116 if (trans != NULL)
117 re_string_translate_buffer (pstr);
118 else
119 {
120 pstr->valid_len = pstr->bufs_len;
121 pstr->valid_raw_len = pstr->bufs_len;
122 }
123 }
124 }
125
126 return REG_NOERROR;
127}
128
129/* Helper functions for re_string_allocate, and re_string_construct. */
130
131static reg_errcode_t
132internal_function __attribute_warn_unused_result__
133re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
134{
135#ifdef RE_ENABLE_I18N
136 if (pstr->mb_cur_max > 1)
137 {
138 wint_t *new_wcs;
139
140 /* Avoid overflow in realloc. */
141 const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
142 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
143 return REG_ESPACE;
144
145 new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
146 if (BE (new_wcs == NULL, 0))
147 return REG_ESPACE;
148 pstr->wcs = new_wcs;
149 if (pstr->offsets != NULL)
150 {
151 Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
152 if (BE (new_offsets == NULL, 0))
153 return REG_ESPACE;
154 pstr->offsets = new_offsets;
155 }
156 }
157#endif /* RE_ENABLE_I18N */
158 if (pstr->mbs_allocated)
159 {
160 unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
161 new_buf_len);
162 if (BE (new_mbs == NULL, 0))
163 return REG_ESPACE;
164 pstr->mbs = new_mbs;
165 }
166 pstr->bufs_len = new_buf_len;
167 return REG_NOERROR;
168}
169
170
171static void
172internal_function
173re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
174 RE_TRANSLATE_TYPE trans, bool icase,
175 const re_dfa_t *dfa)
176{
177 pstr->raw_mbs = (const unsigned char *) str;
178 pstr->len = len;
179 pstr->raw_len = len;
180 pstr->trans = trans;
181 pstr->icase = icase;
182 pstr->mbs_allocated = (trans != NULL || icase);
183 pstr->mb_cur_max = dfa->mb_cur_max;
184 pstr->is_utf8 = dfa->is_utf8;
185 pstr->map_notascii = dfa->map_notascii;
186 pstr->stop = pstr->len;
187 pstr->raw_stop = pstr->stop;
188}
189
190#ifdef RE_ENABLE_I18N
191
192/* Build wide character buffer PSTR->WCS.
193 If the byte sequence of the string are:
194 <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
195 Then wide character buffer will be:
196 <wc1> , WEOF , <wc2> , WEOF , <wc3>
197 We use WEOF for padding, they indicate that the position isn't
198 a first byte of a multibyte character.
199
200 Note that this function assumes PSTR->VALID_LEN elements are already
201 built and starts from PSTR->VALID_LEN. */
202
203static void
204internal_function
205build_wcs_buffer (re_string_t *pstr)
206{
207#ifdef _LIBC
208 unsigned char buf[MB_LEN_MAX];
209 assert (MB_LEN_MAX >= pstr->mb_cur_max);
210#else
211 unsigned char buf[64];
212#endif
213 mbstate_t prev_st;
214 Idx byte_idx, end_idx, remain_len;
215 size_t mbclen;
216
217 /* Build the buffers from pstr->valid_len to either pstr->len or
218 pstr->bufs_len. */
219 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
220 for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
221 {
222 wchar_t wc;
223 const char *p;
224
225 remain_len = end_idx - byte_idx;
226 prev_st = pstr->cur_state;
227 /* Apply the translation if we need. */
228 if (BE (pstr->trans != NULL, 0))
229 {
230 int i, ch;
231
232 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
233 {
234 ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
235 buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
236 }
237 p = (const char *) buf;
238 }
239 else
240 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
241 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
242 if (BE (mbclen == (size_t) -1 || mbclen == 0
243 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
244 {
245 /* We treat these cases as a singlebyte character. */
246 mbclen = 1;
247 wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
248 if (BE (pstr->trans != NULL, 0))
249 wc = pstr->trans[wc];
250 pstr->cur_state = prev_st;
251 }
252 else if (BE (mbclen == (size_t) -2, 0))
253 {
254 /* The buffer doesn't have enough space, finish to build. */
255 pstr->cur_state = prev_st;
256 break;
257 }
258
259 /* Write wide character and padding. */
260 pstr->wcs[byte_idx++] = wc;
261 /* Write paddings. */
262 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
263 pstr->wcs[byte_idx++] = WEOF;
264 }
265 pstr->valid_len = byte_idx;
266 pstr->valid_raw_len = byte_idx;
267}
268
269/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
270 but for REG_ICASE. */
271
272static reg_errcode_t
273internal_function __attribute_warn_unused_result__
274build_wcs_upper_buffer (re_string_t *pstr)
275{
276 mbstate_t prev_st;
277 Idx src_idx, byte_idx, end_idx, remain_len;
278 size_t mbclen;
279#ifdef _LIBC
280 char buf[MB_LEN_MAX];
281 assert (MB_LEN_MAX >= pstr->mb_cur_max);
282#else
283 char buf[64];
284#endif
285
286 byte_idx = pstr->valid_len;
287 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
288
289 /* The following optimization assumes that ASCII characters can be
290 mapped to wide characters with a simple cast. */
291 if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
292 {
293 while (byte_idx < end_idx)
294 {
295 wchar_t wc;
296
297 if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
298 && mbsinit (&pstr->cur_state))
299 {
300 /* In case of a singlebyte character. */
301 pstr->mbs[byte_idx]
302 = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
303 /* The next step uses the assumption that wchar_t is encoded
304 ASCII-safe: all ASCII values can be converted like this. */
305 pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
306 ++byte_idx;
307 continue;
308 }
309
310 remain_len = end_idx - byte_idx;
311 prev_st = pstr->cur_state;
312 mbclen = __mbrtowc (&wc,
313 ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
314 + byte_idx), remain_len, &pstr->cur_state);
315 if (BE (mbclen < (size_t) -2, 1))
316 {
317 wchar_t wcu = __towupper (wc);
318 if (wcu != wc)
319 {
320 size_t mbcdlen;
321
322 mbcdlen = __wcrtomb (buf, wcu, &prev_st);
323 if (BE (mbclen == mbcdlen, 1))
324 memcpy (pstr->mbs + byte_idx, buf, mbclen);
325 else
326 {
327 src_idx = byte_idx;
328 goto offsets_needed;
329 }
330 }
331 else
332 memcpy (pstr->mbs + byte_idx,
333 pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
334 pstr->wcs[byte_idx++] = wcu;
335 /* Write paddings. */
336 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
337 pstr->wcs[byte_idx++] = WEOF;
338 }
339 else if (mbclen == (size_t) -1 || mbclen == 0
340 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
341 {
342 /* It is an invalid character, an incomplete character
343 at the end of the string, or '\0'. Just use the byte. */
344 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
345 pstr->mbs[byte_idx] = ch;
346 /* And also cast it to wide char. */
347 pstr->wcs[byte_idx++] = (wchar_t) ch;
348 if (BE (mbclen == (size_t) -1, 0))
349 pstr->cur_state = prev_st;
350 }
351 else
352 {
353 /* The buffer doesn't have enough space, finish to build. */
354 pstr->cur_state = prev_st;
355 break;
356 }
357 }
358 pstr->valid_len = byte_idx;
359 pstr->valid_raw_len = byte_idx;
360 return REG_NOERROR;
361 }
362 else
363 for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
364 {
365 wchar_t wc;
366 const char *p;
367 offsets_needed:
368 remain_len = end_idx - byte_idx;
369 prev_st = pstr->cur_state;
370 if (BE (pstr->trans != NULL, 0))
371 {
372 int i, ch;
373
374 for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
375 {
376 ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
377 buf[i] = pstr->trans[ch];
378 }
379 p = (const char *) buf;
380 }
381 else
382 p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
383 mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
384 if (BE (mbclen < (size_t) -2, 1))
385 {
386 wchar_t wcu = __towupper (wc);
387 if (wcu != wc)
388 {
389 size_t mbcdlen;
390
391 mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
392 if (BE (mbclen == mbcdlen, 1))
393 memcpy (pstr->mbs + byte_idx, buf, mbclen);
394 else if (mbcdlen != (size_t) -1)
395 {
396 size_t i;
397
398 if (byte_idx + mbcdlen > pstr->bufs_len)
399 {
400 pstr->cur_state = prev_st;
401 break;
402 }
403
404 if (pstr->offsets == NULL)
405 {
406 pstr->offsets = re_malloc (Idx, pstr->bufs_len);
407
408 if (pstr->offsets == NULL)
409 return REG_ESPACE;
410 }
411 if (!pstr->offsets_needed)
412 {
413 for (i = 0; i < (size_t) byte_idx; ++i)
414 pstr->offsets[i] = i;
415 pstr->offsets_needed = 1;
416 }
417
418 memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
419 pstr->wcs[byte_idx] = wcu;
420 pstr->offsets[byte_idx] = src_idx;
421 for (i = 1; i < mbcdlen; ++i)
422 {
423 pstr->offsets[byte_idx + i]
424 = src_idx + (i < mbclen ? i : mbclen - 1);
425 pstr->wcs[byte_idx + i] = WEOF;
426 }
427 pstr->len += mbcdlen - mbclen;
428 if (pstr->raw_stop > src_idx)
429 pstr->stop += mbcdlen - mbclen;
430 end_idx = (pstr->bufs_len > pstr->len)
431 ? pstr->len : pstr->bufs_len;
432 byte_idx += mbcdlen;
433 src_idx += mbclen;
434 continue;
435 }
436 else
437 memcpy (pstr->mbs + byte_idx, p, mbclen);
438 }
439 else
440 memcpy (pstr->mbs + byte_idx, p, mbclen);
441
442 if (BE (pstr->offsets_needed != 0, 0))
443 {
444 size_t i;
445 for (i = 0; i < mbclen; ++i)
446 pstr->offsets[byte_idx + i] = src_idx + i;
447 }
448 src_idx += mbclen;
449
450 pstr->wcs[byte_idx++] = wcu;
451 /* Write paddings. */
452 for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
453 pstr->wcs[byte_idx++] = WEOF;
454 }
455 else if (mbclen == (size_t) -1 || mbclen == 0
456 || (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
457 {
458 /* It is an invalid character or '\0'. Just use the byte. */
459 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
460
461 if (BE (pstr->trans != NULL, 0))
462 ch = pstr->trans [ch];
463 pstr->mbs[byte_idx] = ch;
464
465 if (BE (pstr->offsets_needed != 0, 0))
466 pstr->offsets[byte_idx] = src_idx;
467 ++src_idx;
468
469 /* And also cast it to wide char. */
470 pstr->wcs[byte_idx++] = (wchar_t) ch;
471 if (BE (mbclen == (size_t) -1, 0))
472 pstr->cur_state = prev_st;
473 }
474 else
475 {
476 /* The buffer doesn't have enough space, finish to build. */
477 pstr->cur_state = prev_st;
478 break;
479 }
480 }
481 pstr->valid_len = byte_idx;
482 pstr->valid_raw_len = src_idx;
483 return REG_NOERROR;
484}
485
486/* Skip characters until the index becomes greater than NEW_RAW_IDX.
487 Return the index. */
488
489static Idx
490internal_function
491re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
492{
493 mbstate_t prev_st;
494 Idx rawbuf_idx;
495 size_t mbclen;
496 wint_t wc = WEOF;
497
498 /* Skip the characters which are not necessary to check. */
499 for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
500 rawbuf_idx < new_raw_idx;)
501 {
502 wchar_t wc2;
503 Idx remain_len = pstr->raw_len - rawbuf_idx;
504 prev_st = pstr->cur_state;
505 mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
506 remain_len, &pstr->cur_state);
507 if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
508 {
509 /* We treat these cases as a single byte character. */
510 if (mbclen == 0 || remain_len == 0)
511 wc = L'\0';
512 else
513 wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
514 mbclen = 1;
515 pstr->cur_state = prev_st;
516 }
517 else
518 wc = wc2;
519 /* Then proceed the next character. */
520 rawbuf_idx += mbclen;
521 }
522 *last_wc = wc;
523 return rawbuf_idx;
524}
525#endif /* RE_ENABLE_I18N */
526
527/* Build the buffer PSTR->MBS, and apply the translation if we need.
528 This function is used in case of REG_ICASE. */
529
530static void
531internal_function
532build_upper_buffer (re_string_t *pstr)
533{
534 Idx char_idx, end_idx;
535 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
536
537 for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
538 {
539 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
540 if (BE (pstr->trans != NULL, 0))
541 ch = pstr->trans[ch];
542 pstr->mbs[char_idx] = toupper (ch);
543 }
544 pstr->valid_len = char_idx;
545 pstr->valid_raw_len = char_idx;
546}
547
548/* Apply TRANS to the buffer in PSTR. */
549
550static void
551internal_function
552re_string_translate_buffer (re_string_t *pstr)
553{
554 Idx buf_idx, end_idx;
555 end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
556
557 for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
558 {
559 int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
560 pstr->mbs[buf_idx] = pstr->trans[ch];
561 }
562
563 pstr->valid_len = buf_idx;
564 pstr->valid_raw_len = buf_idx;
565}
566
567/* This function re-construct the buffers.
568 Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
569 convert to upper case in case of REG_ICASE, apply translation. */
570
571static reg_errcode_t
572internal_function __attribute_warn_unused_result__
573re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
574{
575 Idx offset;
576
577 if (BE (pstr->raw_mbs_idx <= idx, 0))
578 offset = idx - pstr->raw_mbs_idx;
579 else
580 {
581 /* Reset buffer. */
582#ifdef RE_ENABLE_I18N
583 if (pstr->mb_cur_max > 1)
584 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
585#endif /* RE_ENABLE_I18N */
586 pstr->len = pstr->raw_len;
587 pstr->stop = pstr->raw_stop;
588 pstr->valid_len = 0;
589 pstr->raw_mbs_idx = 0;
590 pstr->valid_raw_len = 0;
591 pstr->offsets_needed = 0;
592 pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
593 : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
594 if (!pstr->mbs_allocated)
595 pstr->mbs = (unsigned char *) pstr->raw_mbs;
596 offset = idx;
597 }
598
599 if (BE (offset != 0, 1))
600 {
601 /* Should the already checked characters be kept? */
602 if (BE (offset < pstr->valid_raw_len, 1))
603 {
604 /* Yes, move them to the front of the buffer. */
605#ifdef RE_ENABLE_I18N
606 if (BE (pstr->offsets_needed, 0))
607 {
608 Idx low = 0, high = pstr->valid_len, mid;
609 do
610 {
611 mid = (high + low) / 2;
612 if (pstr->offsets[mid] > offset)
613 high = mid;
614 else if (pstr->offsets[mid] < offset)
615 low = mid + 1;
616 else
617 break;
618 }
619 while (low < high);
620 if (pstr->offsets[mid] < offset)
621 ++mid;
622 pstr->tip_context = re_string_context_at (pstr, mid - 1,
623 eflags);
624 /* This can be quite complicated, so handle specially
625 only the common and easy case where the character with
626 different length representation of lower and upper
627 case is present at or after offset. */
628 if (pstr->valid_len > offset
629 && mid == offset && pstr->offsets[mid] == offset)
630 {
631 memmove (pstr->wcs, pstr->wcs + offset,
632 (pstr->valid_len - offset) * sizeof (wint_t));
633 memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
634 pstr->valid_len -= offset;
635 pstr->valid_raw_len -= offset;
636 for (low = 0; low < pstr->valid_len; low++)
637 pstr->offsets[low] = pstr->offsets[low + offset] - offset;
638 }
639 else
640 {
641 /* Otherwise, just find out how long the partial multibyte
642 character at offset is and fill it with WEOF/255. */
643 pstr->len = pstr->raw_len - idx + offset;
644 pstr->stop = pstr->raw_stop - idx + offset;
645 pstr->offsets_needed = 0;
646 while (mid > 0 && pstr->offsets[mid - 1] == offset)
647 --mid;
648 while (mid < pstr->valid_len)
649 if (pstr->wcs[mid] != WEOF)
650 break;
651 else
652 ++mid;
653 if (mid == pstr->valid_len)
654 pstr->valid_len = 0;
655 else
656 {
657 pstr->valid_len = pstr->offsets[mid] - offset;
658 if (pstr->valid_len)
659 {
660 for (low = 0; low < pstr->valid_len; ++low)
661 pstr->wcs[low] = WEOF;
662 memset (pstr->mbs, 255, pstr->valid_len);
663 }
664 }
665 pstr->valid_raw_len = pstr->valid_len;
666 }
667 }
668 else
669#endif
670 {
671 pstr->tip_context = re_string_context_at (pstr, offset - 1,
672 eflags);
673#ifdef RE_ENABLE_I18N
674 if (pstr->mb_cur_max > 1)
675 memmove (pstr->wcs, pstr->wcs + offset,
676 (pstr->valid_len - offset) * sizeof (wint_t));
677#endif /* RE_ENABLE_I18N */
678 if (BE (pstr->mbs_allocated, 0))
679 memmove (pstr->mbs, pstr->mbs + offset,
680 pstr->valid_len - offset);
681 pstr->valid_len -= offset;
682 pstr->valid_raw_len -= offset;
683#if defined DEBUG && DEBUG
684 assert (pstr->valid_len > 0);
685#endif
686 }
687 }
688 else
689 {
690#ifdef RE_ENABLE_I18N
691 /* No, skip all characters until IDX. */
692 Idx prev_valid_len = pstr->valid_len;
693
694 if (BE (pstr->offsets_needed, 0))
695 {
696 pstr->len = pstr->raw_len - idx + offset;
697 pstr->stop = pstr->raw_stop - idx + offset;
698 pstr->offsets_needed = 0;
699 }
700#endif
701 pstr->valid_len = 0;
702#ifdef RE_ENABLE_I18N
703 if (pstr->mb_cur_max > 1)
704 {
705 Idx wcs_idx;
706 wint_t wc = WEOF;
707
708 if (pstr->is_utf8)
709 {
710 const unsigned char *raw, *p, *end;
711
712 /* Special case UTF-8. Multi-byte chars start with any
713 byte other than 0x80 - 0xbf. */
714 raw = pstr->raw_mbs + pstr->raw_mbs_idx;
715 end = raw + (offset - pstr->mb_cur_max);
716 if (end < pstr->raw_mbs)
717 end = pstr->raw_mbs;
718 p = raw + offset - 1;
719#ifdef _LIBC
720 /* We know the wchar_t encoding is UCS4, so for the simple
721 case, ASCII characters, skip the conversion step. */
722 if (isascii (*p) && BE (pstr->trans == NULL, 1))
723 {
724 memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
725 /* pstr->valid_len = 0; */
726 wc = (wchar_t) *p;
727 }
728 else
729#endif
730 for (; p >= end; --p)
731 if ((*p & 0xc0) != 0x80)
732 {
733 mbstate_t cur_state;
734 wchar_t wc2;
735 Idx mlen = raw + pstr->len - p;
736 unsigned char buf[6];
737 size_t mbclen;
738
739 const unsigned char *pp = p;
740 if (BE (pstr->trans != NULL, 0))
741 {
742 int i = mlen < 6 ? mlen : 6;
743 while (--i >= 0)
744 buf[i] = pstr->trans[p[i]];
745 pp = buf;
746 }
747 /* XXX Don't use mbrtowc, we know which conversion
748 to use (UTF-8 -> UCS4). */
749 memset (&cur_state, 0, sizeof (cur_state));
750 mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
751 &cur_state);
752 if (raw + offset - p <= mbclen
753 && mbclen < (size_t) -2)
754 {
755 memset (&pstr->cur_state, '\0',
756 sizeof (mbstate_t));
757 pstr->valid_len = mbclen - (raw + offset - p);
758 wc = wc2;
759 }
760 break;
761 }
762 }
763
764 if (wc == WEOF)
765 pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
766 if (wc == WEOF)
767 pstr->tip_context
768 = re_string_context_at (pstr, prev_valid_len - 1, eflags);
769 else
770 pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
771 && IS_WIDE_WORD_CHAR (wc))
772 ? CONTEXT_WORD
773 : ((IS_WIDE_NEWLINE (wc)
774 && pstr->newline_anchor)
775 ? CONTEXT_NEWLINE : 0));
776 if (BE (pstr->valid_len, 0))
777 {
778 for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
779 pstr->wcs[wcs_idx] = WEOF;
780 if (pstr->mbs_allocated)
781 memset (pstr->mbs, 255, pstr->valid_len);
782 }
783 pstr->valid_raw_len = pstr->valid_len;
784 }
785 else
786#endif /* RE_ENABLE_I18N */
787 {
788 int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
789 pstr->valid_raw_len = 0;
790 if (pstr->trans)
791 c = pstr->trans[c];
792 pstr->tip_context = (bitset_contain (pstr->word_char, c)
793 ? CONTEXT_WORD
794 : ((IS_NEWLINE (c) && pstr->newline_anchor)
795 ? CONTEXT_NEWLINE : 0));
796 }
797 }
798 if (!BE (pstr->mbs_allocated, 0))
799 pstr->mbs += offset;
800 }
801 pstr->raw_mbs_idx = idx;
802 pstr->len -= offset;
803 pstr->stop -= offset;
804
805 /* Then build the buffers. */
806#ifdef RE_ENABLE_I18N
807 if (pstr->mb_cur_max > 1)
808 {
809 if (pstr->icase)
810 {
811 reg_errcode_t ret = build_wcs_upper_buffer (pstr);
812 if (BE (ret != REG_NOERROR, 0))
813 return ret;
814 }
815 else
816 build_wcs_buffer (pstr);
817 }
818 else
819#endif /* RE_ENABLE_I18N */
820 if (BE (pstr->mbs_allocated, 0))
821 {
822 if (pstr->icase)
823 build_upper_buffer (pstr);
824 else if (pstr->trans != NULL)
825 re_string_translate_buffer (pstr);
826 }
827 else
828 pstr->valid_len = pstr->len;
829
830 pstr->cur_idx = 0;
831 return REG_NOERROR;
832}
833
834static unsigned char
835internal_function __attribute__ ((pure))
836re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
837{
838 int ch;
839 Idx off;
840
841 /* Handle the common (easiest) cases first. */
842 if (BE (!pstr->mbs_allocated, 1))
843 return re_string_peek_byte (pstr, idx);
844
845#ifdef RE_ENABLE_I18N
846 if (pstr->mb_cur_max > 1
847 && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
848 return re_string_peek_byte (pstr, idx);
849#endif
850
851 off = pstr->cur_idx + idx;
852#ifdef RE_ENABLE_I18N
853 if (pstr->offsets_needed)
854 off = pstr->offsets[off];
855#endif
856
857 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
858
859#ifdef RE_ENABLE_I18N
860 /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
861 this function returns CAPITAL LETTER I instead of first byte of
862 DOTLESS SMALL LETTER I. The latter would confuse the parser,
863 since peek_byte_case doesn't advance cur_idx in any way. */
864 if (pstr->offsets_needed && !isascii (ch))
865 return re_string_peek_byte (pstr, idx);
866#endif
867
868 return ch;
869}
870
871static unsigned char
872internal_function
873re_string_fetch_byte_case (re_string_t *pstr)
874{
875 if (BE (!pstr->mbs_allocated, 1))
876 return re_string_fetch_byte (pstr);
877
878#ifdef RE_ENABLE_I18N
879 if (pstr->offsets_needed)
880 {
881 Idx off;
882 int ch;
883
884 /* For tr_TR.UTF-8 [[:islower:]] there is
885 [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
886 in that case the whole multi-byte character and return
887 the original letter. On the other side, with
888 [[: DOTLESS SMALL LETTER I return [[:I, as doing
889 anything else would complicate things too much. */
890
891 if (!re_string_first_byte (pstr, pstr->cur_idx))
892 return re_string_fetch_byte (pstr);
893
894 off = pstr->offsets[pstr->cur_idx];
895 ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
896
897 if (! isascii (ch))
898 return re_string_fetch_byte (pstr);
899
900 re_string_skip_bytes (pstr,
901 re_string_char_size_at (pstr, pstr->cur_idx));
902 return ch;
903 }
904#endif
905
906 return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
907}
908
909static void
910internal_function
911re_string_destruct (re_string_t *pstr)
912{
913#ifdef RE_ENABLE_I18N
914 re_free (pstr->wcs);
915 re_free (pstr->offsets);
916#endif /* RE_ENABLE_I18N */
917 if (pstr->mbs_allocated)
918 re_free (pstr->mbs);
919}
920
921/* Return the context at IDX in INPUT. */
922
923static unsigned int
924internal_function
925re_string_context_at (const re_string_t *input, Idx idx, int eflags)
926{
927 int c;
928 if (BE (! REG_VALID_INDEX (idx), 0))
929 /* In this case, we use the value stored in input->tip_context,
930 since we can't know the character in input->mbs[-1] here. */
931 return input->tip_context;
932 if (BE (idx == input->len, 0))
933 return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
934 : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
935#ifdef RE_ENABLE_I18N
936 if (input->mb_cur_max > 1)
937 {
938 wint_t wc;
939 Idx wc_idx = idx;
940 while(input->wcs[wc_idx] == WEOF)
941 {
942#if defined DEBUG && DEBUG
943 /* It must not happen. */
944 assert (REG_VALID_INDEX (wc_idx));
945#endif
946 --wc_idx;
947 if (! REG_VALID_INDEX (wc_idx))
948 return input->tip_context;
949 }
950 wc = input->wcs[wc_idx];
951 if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
952 return CONTEXT_WORD;
953 return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
954 ? CONTEXT_NEWLINE : 0);
955 }
956 else
957#endif
958 {
959 c = re_string_byte_at (input, idx);
960 if (bitset_contain (input->word_char, c))
961 return CONTEXT_WORD;
962 return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
963 }
964}
965
966
967/* Functions for set operation. */
968
969static reg_errcode_t
970internal_function __attribute_warn_unused_result__
971re_node_set_alloc (re_node_set *set, Idx size)
972{
973 set->alloc = size;
974 set->nelem = 0;
975 set->elems = re_malloc (Idx, size);
976 if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL || size != 0))
977 return REG_ESPACE;
978 return REG_NOERROR;
979}
980
981static reg_errcode_t
982internal_function __attribute_warn_unused_result__
983re_node_set_init_1 (re_node_set *set, Idx elem)
984{
985 set->alloc = 1;
986 set->nelem = 1;
987 set->elems = re_malloc (Idx, 1);
988 if (BE (set->elems == NULL, 0))
989 {
990 set->alloc = set->nelem = 0;
991 return REG_ESPACE;
992 }
993 set->elems[0] = elem;
994 return REG_NOERROR;
995}
996
997static reg_errcode_t
998internal_function __attribute_warn_unused_result__
999re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1000{
1001 set->alloc = 2;
1002 set->elems = re_malloc (Idx, 2);
1003 if (BE (set->elems == NULL, 0))
1004 return REG_ESPACE;
1005 if (elem1 == elem2)
1006 {
1007 set->nelem = 1;
1008 set->elems[0] = elem1;
1009 }
1010 else
1011 {
1012 set->nelem = 2;
1013 if (elem1 < elem2)
1014 {
1015 set->elems[0] = elem1;
1016 set->elems[1] = elem2;
1017 }
1018 else
1019 {
1020 set->elems[0] = elem2;
1021 set->elems[1] = elem1;
1022 }
1023 }
1024 return REG_NOERROR;
1025}
1026
1027static reg_errcode_t
1028internal_function __attribute_warn_unused_result__
1029re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
1030{
1031 dest->nelem = src->nelem;
1032 if (src->nelem > 0)
1033 {
1034 dest->alloc = dest->nelem;
1035 dest->elems = re_malloc (Idx, dest->alloc);
1036 if (BE (dest->elems == NULL, 0))
1037 {
1038 dest->alloc = dest->nelem = 0;
1039 return REG_ESPACE;
1040 }
1041 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1042 }
1043 else
1044 re_node_set_init_empty (dest);
1045 return REG_NOERROR;
1046}
1047
1048/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1049 DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1050 Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1051
1052static reg_errcode_t
1053internal_function __attribute_warn_unused_result__
1054re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
1055 const re_node_set *src2)
1056{
1057 Idx i1, i2, is, id, delta, sbase;
1058 if (src1->nelem == 0 || src2->nelem == 0)
1059 return REG_NOERROR;
1060
1061 /* We need dest->nelem + 2 * elems_in_intersection; this is a
1062 conservative estimate. */
1063 if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1064 {
1065 Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1066 Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1067 if (BE (new_elems == NULL, 0))
1068 return REG_ESPACE;
1069 dest->elems = new_elems;
1070 dest->alloc = new_alloc;
1071 }
1072
1073 /* Find the items in the intersection of SRC1 and SRC2, and copy
1074 into the top of DEST those that are not already in DEST itself. */
1075 sbase = dest->nelem + src1->nelem + src2->nelem;
1076 i1 = src1->nelem - 1;
1077 i2 = src2->nelem - 1;
1078 id = dest->nelem - 1;
1079 for (;;)
1080 {
1081 if (src1->elems[i1] == src2->elems[i2])
1082 {
1083 /* Try to find the item in DEST. Maybe we could binary search? */
1084 while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1085 --id;
1086
1087 if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
1088 dest->elems[--sbase] = src1->elems[i1];
1089
1090 if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
1091 break;
1092 }
1093
1094 /* Lower the highest of the two items. */
1095 else if (src1->elems[i1] < src2->elems[i2])
1096 {
1097 if (! REG_VALID_INDEX (--i2))
1098 break;
1099 }
1100 else
1101 {
1102 if (! REG_VALID_INDEX (--i1))
1103 break;
1104 }
1105 }
1106
1107 id = dest->nelem - 1;
1108 is = dest->nelem + src1->nelem + src2->nelem - 1;
1109 delta = is - sbase + 1;
1110
1111 /* Now copy. When DELTA becomes zero, the remaining
1112 DEST elements are already in place; this is more or
1113 less the same loop that is in re_node_set_merge. */
1114 dest->nelem += delta;
1115 if (delta > 0 && REG_VALID_INDEX (id))
1116 for (;;)
1117 {
1118 if (dest->elems[is] > dest->elems[id])
1119 {
1120 /* Copy from the top. */
1121 dest->elems[id + delta--] = dest->elems[is--];
1122 if (delta == 0)
1123 break;
1124 }
1125 else
1126 {
1127 /* Slide from the bottom. */
1128 dest->elems[id + delta] = dest->elems[id];
1129 if (! REG_VALID_INDEX (--id))
1130 break;
1131 }
1132 }
1133
1134 /* Copy remaining SRC elements. */
1135 memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1136
1137 return REG_NOERROR;
1138}
1139
1140/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1141 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1142
1143static reg_errcode_t
1144internal_function __attribute_warn_unused_result__
1145re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
1146 const re_node_set *src2)
1147{
1148 Idx i1, i2, id;
1149 if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1150 {
1151 dest->alloc = src1->nelem + src2->nelem;
1152 dest->elems = re_malloc (Idx, dest->alloc);
1153 if (BE (dest->elems == NULL, 0))
1154 return REG_ESPACE;
1155 }
1156 else
1157 {
1158 if (src1 != NULL && src1->nelem > 0)
1159 return re_node_set_init_copy (dest, src1);
1160 else if (src2 != NULL && src2->nelem > 0)
1161 return re_node_set_init_copy (dest, src2);
1162 else
1163 re_node_set_init_empty (dest);
1164 return REG_NOERROR;
1165 }
1166 for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1167 {
1168 if (src1->elems[i1] > src2->elems[i2])
1169 {
1170 dest->elems[id++] = src2->elems[i2++];
1171 continue;
1172 }
1173 if (src1->elems[i1] == src2->elems[i2])
1174 ++i2;
1175 dest->elems[id++] = src1->elems[i1++];
1176 }
1177 if (i1 < src1->nelem)
1178 {
1179 memcpy (dest->elems + id, src1->elems + i1,
1180 (src1->nelem - i1) * sizeof (Idx));
1181 id += src1->nelem - i1;
1182 }
1183 else if (i2 < src2->nelem)
1184 {
1185 memcpy (dest->elems + id, src2->elems + i2,
1186 (src2->nelem - i2) * sizeof (Idx));
1187 id += src2->nelem - i2;
1188 }
1189 dest->nelem = id;
1190 return REG_NOERROR;
1191}
1192
1193/* Calculate the union set of the sets DEST and SRC. And store it to
1194 DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1195
1196static reg_errcode_t
1197internal_function __attribute_warn_unused_result__
1198re_node_set_merge (re_node_set *dest, const re_node_set *src)
1199{
1200 Idx is, id, sbase, delta;
1201 if (src == NULL || src->nelem == 0)
1202 return REG_NOERROR;
1203 if (dest->alloc < 2 * src->nelem + dest->nelem)
1204 {
1205 Idx new_alloc = 2 * (src->nelem + dest->alloc);
1206 Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1207 if (BE (new_buffer == NULL, 0))
1208 return REG_ESPACE;
1209 dest->elems = new_buffer;
1210 dest->alloc = new_alloc;
1211 }
1212
1213 if (BE (dest->nelem == 0, 0))
1214 {
1215 dest->nelem = src->nelem;
1216 memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1217 return REG_NOERROR;
1218 }
1219
1220 /* Copy into the top of DEST the items of SRC that are not
1221 found in DEST. Maybe we could binary search in DEST? */
1222 for (sbase = dest->nelem + 2 * src->nelem,
1223 is = src->nelem - 1, id = dest->nelem - 1;
1224 REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1225 {
1226 if (dest->elems[id] == src->elems[is])
1227 is--, id--;
1228 else if (dest->elems[id] < src->elems[is])
1229 dest->elems[--sbase] = src->elems[is--];
1230 else /* if (dest->elems[id] > src->elems[is]) */
1231 --id;
1232 }
1233
1234 if (REG_VALID_INDEX (is))
1235 {
1236 /* If DEST is exhausted, the remaining items of SRC must be unique. */
1237 sbase -= is + 1;
1238 memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1239 }
1240
1241 id = dest->nelem - 1;
1242 is = dest->nelem + 2 * src->nelem - 1;
1243 delta = is - sbase + 1;
1244 if (delta == 0)
1245 return REG_NOERROR;
1246
1247 /* Now copy. When DELTA becomes zero, the remaining
1248 DEST elements are already in place. */
1249 dest->nelem += delta;
1250 for (;;)
1251 {
1252 if (dest->elems[is] > dest->elems[id])
1253 {
1254 /* Copy from the top. */
1255 dest->elems[id + delta--] = dest->elems[is--];
1256 if (delta == 0)
1257 break;
1258 }
1259 else
1260 {
1261 /* Slide from the bottom. */
1262 dest->elems[id + delta] = dest->elems[id];
1263 if (! REG_VALID_INDEX (--id))
1264 {
1265 /* Copy remaining SRC elements. */
1266 memcpy (dest->elems, dest->elems + sbase,
1267 delta * sizeof (Idx));
1268 break;
1269 }
1270 }
1271 }
1272
1273 return REG_NOERROR;
1274}
1275
1276/* Insert the new element ELEM to the re_node_set* SET.
1277 SET should not already have ELEM.
1278 Return true if successful. */
1279
1280static bool
1281internal_function __attribute_warn_unused_result__
1282re_node_set_insert (re_node_set *set, Idx elem)
1283{
1284 Idx idx;
1285 /* In case the set is empty. */
1286 if (set->alloc == 0)
1287 return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1288
1289 if (BE (set->nelem, 0) == 0)
1290 {
1291 /* We already guaranteed above that set->alloc != 0. */
1292 set->elems[0] = elem;
1293 ++set->nelem;
1294 return true;
1295 }
1296
1297 /* Realloc if we need. */
1298 if (set->alloc == set->nelem)
1299 {
1300 Idx *new_elems;
1301 set->alloc = set->alloc * 2;
1302 new_elems = re_realloc (set->elems, Idx, set->alloc);
1303 if (BE (new_elems == NULL, 0))
1304 return false;
1305 set->elems = new_elems;
1306 }
1307
1308 /* Move the elements which follows the new element. Test the
1309 first element separately to skip a check in the inner loop. */
1310 if (elem < set->elems[0])
1311 {
1312 idx = 0;
1313 for (idx = set->nelem; idx > 0; idx--)
1314 set->elems[idx] = set->elems[idx - 1];
1315 }
1316 else
1317 {
1318 for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1319 set->elems[idx] = set->elems[idx - 1];
1320 }
1321
1322 /* Insert the new element. */
1323 set->elems[idx] = elem;
1324 ++set->nelem;
1325 return true;
1326}
1327
1328/* Insert the new element ELEM to the re_node_set* SET.
1329 SET should not already have any element greater than or equal to ELEM.
1330 Return true if successful. */
1331
1332static bool
1333internal_function __attribute_warn_unused_result__
1334re_node_set_insert_last (re_node_set *set, Idx elem)
1335{
1336 /* Realloc if we need. */
1337 if (set->alloc == set->nelem)
1338 {
1339 Idx *new_elems;
1340 set->alloc = (set->alloc + 1) * 2;
1341 new_elems = re_realloc (set->elems, Idx, set->alloc);
1342 if (BE (new_elems == NULL, 0))
1343 return false;
1344 set->elems = new_elems;
1345 }
1346
1347 /* Insert the new element. */
1348 set->elems[set->nelem++] = elem;
1349 return true;
1350}
1351
1352/* Compare two node sets SET1 and SET2.
1353 Return true if SET1 and SET2 are equivalent. */
1354
1355static bool
1356internal_function __attribute__ ((pure))
1357re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
1358{
1359 Idx i;
1360 if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
1361 return false;
1362 for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1363 if (set1->elems[i] != set2->elems[i])
1364 return false;
1365 return true;
1366}
1367
1368/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1369
1370static Idx
1371internal_function __attribute__ ((pure))
1372re_node_set_contains (const re_node_set *set, Idx elem)
1373{
1374 __re_size_t idx, right, mid;
1375 if (! REG_VALID_NONZERO_INDEX (set->nelem))
1376 return 0;
1377
1378 /* Binary search the element. */
1379 idx = 0;
1380 right = set->nelem - 1;
1381 while (idx < right)
1382 {
1383 mid = (idx + right) / 2;
1384 if (set->elems[mid] < elem)
1385 idx = mid + 1;
1386 else
1387 right = mid;
1388 }
1389 return set->elems[idx] == elem ? idx + 1 : 0;
1390}
1391
1392static void
1393internal_function
1394re_node_set_remove_at (re_node_set *set, Idx idx)
1395{
1396 verify (! TYPE_SIGNED (Idx));
1397 /* if (idx < 0)
1398 return; */
1399 if (idx >= set->nelem)
1400 return;
1401 --set->nelem;
1402 for (; idx < set->nelem; idx++)
1403 set->elems[idx] = set->elems[idx + 1];
1404}
1405
1406
1407
1408/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1409 Or return REG_MISSING if an error occurred. */
1410
1411static Idx
1412internal_function
1413re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1414{
1415 if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1416 {
1417 size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1418 Idx *new_nexts, *new_indices;
1419 re_node_set *new_edests, *new_eclosures;
1420 re_token_t *new_nodes;
1421
1422 /* Avoid overflows in realloc. */
1423 const size_t max_object_size = MAX (sizeof (re_token_t),
1424 MAX (sizeof (re_node_set),
1425 sizeof (Idx)));
1426 if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1427 return REG_MISSING;
1428
1429 new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1430 if (BE (new_nodes == NULL, 0))
1431 return REG_MISSING;
1432 dfa->nodes = new_nodes;
1433 new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1434 new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1435 new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1436 new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1437 if (BE (new_nexts == NULL || new_indices == NULL
1438 || new_edests == NULL || new_eclosures == NULL, 0))
1439 return REG_MISSING;
1440 dfa->nexts = new_nexts;
1441 dfa->org_indices = new_indices;
1442 dfa->edests = new_edests;
1443 dfa->eclosures = new_eclosures;
1444 dfa->nodes_alloc = new_nodes_alloc;
1445 }
1446 dfa->nodes[dfa->nodes_len] = token;
1447 dfa->nodes[dfa->nodes_len].constraint = 0;
1448#ifdef RE_ENABLE_I18N
1449 dfa->nodes[dfa->nodes_len].accept_mb =
1450 ((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1451 || token.type == COMPLEX_BRACKET);
1452#endif
1453 dfa->nexts[dfa->nodes_len] = REG_MISSING;
1454 re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1455 re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1456 return dfa->nodes_len++;
1457}
1458
1459static re_hashval_t
1460internal_function
1461calc_state_hash (const re_node_set *nodes, unsigned int context)
1462{
1463 re_hashval_t hash = nodes->nelem + context;
1464 Idx i;
1465 for (i = 0 ; i < nodes->nelem ; i++)
1466 hash += nodes->elems[i];
1467 return hash;
1468}
1469
1470/* Search for the state whose node_set is equivalent to NODES.
1471 Return the pointer to the state, if we found it in the DFA.
1472 Otherwise create the new one and return it. In case of an error
1473 return NULL and set the error code in ERR.
1474 Note: - We assume NULL as the invalid state, then it is possible that
1475 return value is NULL and ERR is REG_NOERROR.
1476 - We never return non-NULL value in case of any errors, it is for
1477 optimization. */
1478
1479static re_dfastate_t *
1480internal_function __attribute_warn_unused_result__
1481re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
1482 const re_node_set *nodes)
1483{
1484 re_hashval_t hash;
1485 re_dfastate_t *new_state;
1486 struct re_state_table_entry *spot;
1487 Idx i;
1488#ifdef lint
1489 /* Suppress bogus uninitialized-variable warnings. */
1490 *err = REG_NOERROR;
1491#endif
1492 if (BE (nodes->nelem == 0, 0))
1493 {
1494 *err = REG_NOERROR;
1495 return NULL;
1496 }
1497 hash = calc_state_hash (nodes, 0);
1498 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1499
1500 for (i = 0 ; i < spot->num ; i++)
1501 {
1502 re_dfastate_t *state = spot->array[i];
1503 if (hash != state->hash)
1504 continue;
1505 if (re_node_set_compare (&state->nodes, nodes))
1506 return state;
1507 }
1508
1509 /* There are no appropriate state in the dfa, create the new one. */
1510 new_state = create_ci_newstate (dfa, nodes, hash);
1511 if (BE (new_state == NULL, 0))
1512 *err = REG_ESPACE;
1513
1514 return new_state;
1515}
1516
1517/* Search for the state whose node_set is equivalent to NODES and
1518 whose context is equivalent to CONTEXT.
1519 Return the pointer to the state, if we found it in the DFA.
1520 Otherwise create the new one and return it. In case of an error
1521 return NULL and set the error code in ERR.
1522 Note: - We assume NULL as the invalid state, then it is possible that
1523 return value is NULL and ERR is REG_NOERROR.
1524 - We never return non-NULL value in case of any errors, it is for
1525 optimization. */
1526
1527static re_dfastate_t *
1528internal_function __attribute_warn_unused_result__
1529re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
1530 const re_node_set *nodes, unsigned int context)
1531{
1532 re_hashval_t hash;
1533 re_dfastate_t *new_state;
1534 struct re_state_table_entry *spot;
1535 Idx i;
1536#ifdef lint
1537 /* Suppress bogus uninitialized-variable warnings. */
1538 *err = REG_NOERROR;
1539#endif
1540 if (nodes->nelem == 0)
1541 {
1542 *err = REG_NOERROR;
1543 return NULL;
1544 }
1545 hash = calc_state_hash (nodes, context);
1546 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1547
1548 for (i = 0 ; i < spot->num ; i++)
1549 {
1550 re_dfastate_t *state = spot->array[i];
1551 if (state->hash == hash
1552 && state->context == context
1553 && re_node_set_compare (state->entrance_nodes, nodes))
1554 return state;
1555 }
1556 /* There are no appropriate state in 'dfa', create the new one. */
1557 new_state = create_cd_newstate (dfa, nodes, context, hash);
1558 if (BE (new_state == NULL, 0))
1559 *err = REG_ESPACE;
1560
1561 return new_state;
1562}
1563
1564/* Finish initialization of the new state NEWSTATE, and using its hash value
1565 HASH put in the appropriate bucket of DFA's state table. Return value
1566 indicates the error code if failed. */
1567
1568static reg_errcode_t
1569__attribute_warn_unused_result__
1570register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
1571 re_hashval_t hash)
1572{
1573 struct re_state_table_entry *spot;
1574 reg_errcode_t err;
1575 Idx i;
1576
1577 newstate->hash = hash;
1578 err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1579 if (BE (err != REG_NOERROR, 0))
1580 return REG_ESPACE;
1581 for (i = 0; i < newstate->nodes.nelem; i++)
1582 {
1583 Idx elem = newstate->nodes.elems[i];
1584 if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1585 if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1586 return REG_ESPACE;
1587 }
1588
1589 spot = dfa->state_table + (hash & dfa->state_hash_mask);
1590 if (BE (spot->alloc <= spot->num, 0))
1591 {
1592 Idx new_alloc = 2 * spot->num + 2;
1593 re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
1594 new_alloc);
1595 if (BE (new_array == NULL, 0))
1596 return REG_ESPACE;
1597 spot->array = new_array;
1598 spot->alloc = new_alloc;
1599 }
1600 spot->array[spot->num++] = newstate;
1601 return REG_NOERROR;
1602}
1603
1604static void
1605free_state (re_dfastate_t *state)
1606{
1607 re_node_set_free (&state->non_eps_nodes);
1608 re_node_set_free (&state->inveclosure);
1609 if (state->entrance_nodes != &state->nodes)
1610 {
1611 re_node_set_free (state->entrance_nodes);
1612 re_free (state->entrance_nodes);
1613 }
1614 re_node_set_free (&state->nodes);
1615 re_free (state->word_trtable);
1616 re_free (state->trtable);
1617 re_free (state);
1618}
1619
1620/* Create the new state which is independent of contexts.
1621 Return the new state if succeeded, otherwise return NULL. */
1622
1623static re_dfastate_t *
1624internal_function __attribute_warn_unused_result__
1625create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1626 re_hashval_t hash)
1627{
1628 Idx i;
1629 reg_errcode_t err;
1630 re_dfastate_t *newstate;
1631
1632 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1633 if (BE (newstate == NULL, 0))
1634 return NULL;
1635 err = re_node_set_init_copy (&newstate->nodes, nodes);
1636 if (BE (err != REG_NOERROR, 0))
1637 {
1638 re_free (newstate);
1639 return NULL;
1640 }
1641
1642 newstate->entrance_nodes = &newstate->nodes;
1643 for (i = 0 ; i < nodes->nelem ; i++)
1644 {
1645 re_token_t *node = dfa->nodes + nodes->elems[i];
1646 re_token_type_t type = node->type;
1647 if (type == CHARACTER && !node->constraint)
1648 continue;
1649#ifdef RE_ENABLE_I18N
1650 newstate->accept_mb |= node->accept_mb;
1651#endif /* RE_ENABLE_I18N */
1652
1653 /* If the state has the halt node, the state is a halt state. */
1654 if (type == END_OF_RE)
1655 newstate->halt = 1;
1656 else if (type == OP_BACK_REF)
1657 newstate->has_backref = 1;
1658 else if (type == ANCHOR || node->constraint)
1659 newstate->has_constraint = 1;
1660 }
1661 err = register_state (dfa, newstate, hash);
1662 if (BE (err != REG_NOERROR, 0))
1663 {
1664 free_state (newstate);
1665 newstate = NULL;
1666 }
1667 return newstate;
1668}
1669
1670/* Create the new state which is depend on the context CONTEXT.
1671 Return the new state if succeeded, otherwise return NULL. */
1672
1673static re_dfastate_t *
1674internal_function __attribute_warn_unused_result__
1675create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
1676 unsigned int context, re_hashval_t hash)
1677{
1678 Idx i, nctx_nodes = 0;
1679 reg_errcode_t err;
1680 re_dfastate_t *newstate;
1681
1682 newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1683 if (BE (newstate == NULL, 0))
1684 return NULL;
1685 err = re_node_set_init_copy (&newstate->nodes, nodes);
1686 if (BE (err != REG_NOERROR, 0))
1687 {
1688 re_free (newstate);
1689 return NULL;
1690 }
1691
1692 newstate->context = context;
1693 newstate->entrance_nodes = &newstate->nodes;
1694
1695 for (i = 0 ; i < nodes->nelem ; i++)
1696 {
1697 re_token_t *node = dfa->nodes + nodes->elems[i];
1698 re_token_type_t type = node->type;
1699 unsigned int constraint = node->constraint;
1700
1701 if (type == CHARACTER && !constraint)
1702 continue;
1703#ifdef RE_ENABLE_I18N
1704 newstate->accept_mb |= node->accept_mb;
1705#endif /* RE_ENABLE_I18N */
1706
1707 /* If the state has the halt node, the state is a halt state. */
1708 if (type == END_OF_RE)
1709 newstate->halt = 1;
1710 else if (type == OP_BACK_REF)
1711 newstate->has_backref = 1;
1712
1713 if (constraint)
1714 {
1715 if (newstate->entrance_nodes == &newstate->nodes)
1716 {
1717 newstate->entrance_nodes = re_malloc (re_node_set, 1);
1718 if (BE (newstate->entrance_nodes == NULL, 0))
1719 {
1720 free_state (newstate);
1721 return NULL;
1722 }
1723 if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1724 != REG_NOERROR)
1725 return NULL;
1726 nctx_nodes = 0;
1727 newstate->has_constraint = 1;
1728 }
1729
1730 if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1731 {
1732 re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1733 ++nctx_nodes;
1734 }
1735 }
1736 }
1737 err = register_state (dfa, newstate, hash);
1738 if (BE (err != REG_NOERROR, 0))
1739 {
1740 free_state (newstate);
1741 newstate = NULL;
1742 }
1743 return newstate;
1744}
Note: See TracBrowser for help on using the repository browser.