Context Navigation

source: coreutils/trunk/lib/regex_internal.c@ 1648

Last change on this file since 1648 was 1648, checked in by Silvan Scherrer, 9 years ago
coreutils: update trunk to version 8.25
File size: 47.6 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002-2016 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU General Public
8	License as published by the Free Software Foundation; either
9	version 3 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	General Public License for more details.
15
16	You should have received a copy of the GNU General Public
17	License along with the GNU C Library; if not, see
18	<http://www.gnu.org/licenses/>. */
19
20	#include "verify.h"
21	#include "intprops.h"
22	static void re_string_construct_common (const char *str, Idx len,
23	re_string_t *pstr,
24	RE_TRANSLATE_TYPE trans, bool icase,
25	const re_dfa_t *dfa) internal_function;
26	static re_dfastate_t create_ci_newstate (const re_dfa_t dfa,
27	const re_node_set *nodes,
28	re_hashval_t hash) internal_function;
29	static re_dfastate_t create_cd_newstate (const re_dfa_t dfa,
30	const re_node_set *nodes,
31	unsigned int context,
32	re_hashval_t hash) internal_function;
33
34
35	/* Functions for string operation. */
36
37	/* This function allocate the buffers. It is necessary to call
38	re_string_reconstruct before using the object. */
39
40	static reg_errcode_t
41	internal_function __attribute_warn_unused_result__
42	re_string_allocate (re_string_t pstr, const char str, Idx len, Idx init_len,
43	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
44	{
45	reg_errcode_t ret;
46	Idx init_buf_len;
47
48	/* Ensure at least one character fits into the buffers. */
49	if (init_len < dfa->mb_cur_max)
50	init_len = dfa->mb_cur_max;
51	init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
52	re_string_construct_common (str, len, pstr, trans, icase, dfa);
53
54	ret = re_string_realloc_buffers (pstr, init_buf_len);
55	if (BE (ret != REG_NOERROR, 0))
56	return ret;
57
58	pstr->word_char = dfa->word_char;
59	pstr->word_ops_used = dfa->word_ops_used;
60	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
61	pstr->valid_len = (pstr->mbs_allocated \|\| dfa->mb_cur_max > 1) ? 0 : len;
62	pstr->valid_raw_len = pstr->valid_len;
63	return REG_NOERROR;
64	}
65
66	/* This function allocate the buffers, and initialize them. */
67
68	static reg_errcode_t
69	internal_function __attribute_warn_unused_result__
70	re_string_construct (re_string_t pstr, const char str, Idx len,
71	RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
72	{
73	reg_errcode_t ret;
74	memset (pstr, '\0', sizeof (re_string_t));
75	re_string_construct_common (str, len, pstr, trans, icase, dfa);
76
77	if (len > 0)
78	{
79	ret = re_string_realloc_buffers (pstr, len + 1);
80	if (BE (ret != REG_NOERROR, 0))
81	return ret;
82	}
83	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
84
85	if (icase)
86	{
87	#ifdef RE_ENABLE_I18N
88	if (dfa->mb_cur_max > 1)
89	{
90	while (1)
91	{
92	ret = build_wcs_upper_buffer (pstr);
93	if (BE (ret != REG_NOERROR, 0))
94	return ret;
95	if (pstr->valid_raw_len >= len)
96	break;
97	if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
98	break;
99	ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
100	if (BE (ret != REG_NOERROR, 0))
101	return ret;
102	}
103	}
104	else
105	#endif /* RE_ENABLE_I18N */
106	build_upper_buffer (pstr);
107	}
108	else
109	{
110	#ifdef RE_ENABLE_I18N
111	if (dfa->mb_cur_max > 1)
112	build_wcs_buffer (pstr);
113	else
114	#endif /* RE_ENABLE_I18N */
115	{
116	if (trans != NULL)
117	re_string_translate_buffer (pstr);
118	else
119	{
120	pstr->valid_len = pstr->bufs_len;
121	pstr->valid_raw_len = pstr->bufs_len;
122	}
123	}
124	}
125
126	return REG_NOERROR;
127	}
128
129	/* Helper functions for re_string_allocate, and re_string_construct. */
130
131	static reg_errcode_t
132	internal_function __attribute_warn_unused_result__
133	re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
134	{
135	#ifdef RE_ENABLE_I18N
136	if (pstr->mb_cur_max > 1)
137	{
138	wint_t *new_wcs;
139
140	/* Avoid overflow in realloc. */
141	const size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
142	if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_buf_len, 0))
143	return REG_ESPACE;
144
145	new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
146	if (BE (new_wcs == NULL, 0))
147	return REG_ESPACE;
148	pstr->wcs = new_wcs;
149	if (pstr->offsets != NULL)
150	{
151	Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
152	if (BE (new_offsets == NULL, 0))
153	return REG_ESPACE;
154	pstr->offsets = new_offsets;
155	}
156	}
157	#endif /* RE_ENABLE_I18N */
158	if (pstr->mbs_allocated)
159	{
160	unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
161	new_buf_len);
162	if (BE (new_mbs == NULL, 0))
163	return REG_ESPACE;
164	pstr->mbs = new_mbs;
165	}
166	pstr->bufs_len = new_buf_len;
167	return REG_NOERROR;
168	}
169
170
171	static void
172	internal_function
173	re_string_construct_common (const char str, Idx len, re_string_t pstr,
174	RE_TRANSLATE_TYPE trans, bool icase,
175	const re_dfa_t *dfa)
176	{
177	pstr->raw_mbs = (const unsigned char *) str;
178	pstr->len = len;
179	pstr->raw_len = len;
180	pstr->trans = trans;
181	pstr->icase = icase;
182	pstr->mbs_allocated = (trans != NULL \|\| icase);
183	pstr->mb_cur_max = dfa->mb_cur_max;
184	pstr->is_utf8 = dfa->is_utf8;
185	pstr->map_notascii = dfa->map_notascii;
186	pstr->stop = pstr->len;
187	pstr->raw_stop = pstr->stop;
188	}
189
190	#ifdef RE_ENABLE_I18N
191
192	/* Build wide character buffer PSTR->WCS.
193	If the byte sequence of the string are:
194	<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
195	Then wide character buffer will be:
196	<wc1> , WEOF , <wc2> , WEOF , <wc3>
197	We use WEOF for padding, they indicate that the position isn't
198	a first byte of a multibyte character.
199
200	Note that this function assumes PSTR->VALID_LEN elements are already
201	built and starts from PSTR->VALID_LEN. */
202
203	static void
204	internal_function
205	build_wcs_buffer (re_string_t *pstr)
206	{
207	#ifdef _LIBC
208	unsigned char buf[MB_LEN_MAX];
209	assert (MB_LEN_MAX >= pstr->mb_cur_max);
210	#else
211	unsigned char buf[64];
212	#endif
213	mbstate_t prev_st;
214	Idx byte_idx, end_idx, remain_len;
215	size_t mbclen;
216
217	/* Build the buffers from pstr->valid_len to either pstr->len or
218	pstr->bufs_len. */
219	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
220	for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
221	{
222	wchar_t wc;
223	const char *p;
224
225	remain_len = end_idx - byte_idx;
226	prev_st = pstr->cur_state;
227	/* Apply the translation if we need. */
228	if (BE (pstr->trans != NULL, 0))
229	{
230	int i, ch;
231
232	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
233	{
234	ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
235	buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
236	}
237	p = (const char *) buf;
238	}
239	else
240	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
241	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
242	if (BE (mbclen == (size_t) -1 \|\| mbclen == 0
243	\|\| (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len), 0))
244	{
245	/* We treat these cases as a singlebyte character. */
246	mbclen = 1;
247	wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
248	if (BE (pstr->trans != NULL, 0))
249	wc = pstr->trans[wc];
250	pstr->cur_state = prev_st;
251	}
252	else if (BE (mbclen == (size_t) -2, 0))
253	{
254	/* The buffer doesn't have enough space, finish to build. */
255	pstr->cur_state = prev_st;
256	break;
257	}
258
259	/* Write wide character and padding. */
260	pstr->wcs[byte_idx++] = wc;
261	/* Write paddings. */
262	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
263	pstr->wcs[byte_idx++] = WEOF;
264	}
265	pstr->valid_len = byte_idx;
266	pstr->valid_raw_len = byte_idx;
267	}
268
269	/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
270	but for REG_ICASE. */
271
272	static reg_errcode_t
273	internal_function __attribute_warn_unused_result__
274	build_wcs_upper_buffer (re_string_t *pstr)
275	{
276	mbstate_t prev_st;
277	Idx src_idx, byte_idx, end_idx, remain_len;
278	size_t mbclen;
279	#ifdef _LIBC
280	char buf[MB_LEN_MAX];
281	assert (MB_LEN_MAX >= pstr->mb_cur_max);
282	#else
283	char buf[64];
284	#endif
285
286	byte_idx = pstr->valid_len;
287	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
288
289	/* The following optimization assumes that ASCII characters can be
290	mapped to wide characters with a simple cast. */
291	if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
292	{
293	while (byte_idx < end_idx)
294	{
295	wchar_t wc;
296
297	if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
298	&& mbsinit (&pstr->cur_state))
299	{
300	/* In case of a singlebyte character. */
301	pstr->mbs[byte_idx]
302	= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
303	/* The next step uses the assumption that wchar_t is encoded
304	ASCII-safe: all ASCII values can be converted like this. */
305	pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
306	++byte_idx;
307	continue;
308	}
309
310	remain_len = end_idx - byte_idx;
311	prev_st = pstr->cur_state;
312	mbclen = __mbrtowc (&wc,
313	((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
314	+ byte_idx), remain_len, &pstr->cur_state);
315	if (BE (mbclen < (size_t) -2, 1))
316	{
317	wchar_t wcu = __towupper (wc);
318	if (wcu != wc)
319	{
320	size_t mbcdlen;
321
322	mbcdlen = __wcrtomb (buf, wcu, &prev_st);
323	if (BE (mbclen == mbcdlen, 1))
324	memcpy (pstr->mbs + byte_idx, buf, mbclen);
325	else
326	{
327	src_idx = byte_idx;
328	goto offsets_needed;
329	}
330	}
331	else
332	memcpy (pstr->mbs + byte_idx,
333	pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
334	pstr->wcs[byte_idx++] = wcu;
335	/* Write paddings. */
336	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
337	pstr->wcs[byte_idx++] = WEOF;
338	}
339	else if (mbclen == (size_t) -1 \|\| mbclen == 0
340	\|\| (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
341	{
342	/* It is an invalid character, an incomplete character
343	at the end of the string, or '\0'. Just use the byte. */
344	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
345	pstr->mbs[byte_idx] = ch;
346	/* And also cast it to wide char. */
347	pstr->wcs[byte_idx++] = (wchar_t) ch;
348	if (BE (mbclen == (size_t) -1, 0))
349	pstr->cur_state = prev_st;
350	}
351	else
352	{
353	/* The buffer doesn't have enough space, finish to build. */
354	pstr->cur_state = prev_st;
355	break;
356	}
357	}
358	pstr->valid_len = byte_idx;
359	pstr->valid_raw_len = byte_idx;
360	return REG_NOERROR;
361	}
362	else
363	for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
364	{
365	wchar_t wc;
366	const char *p;
367	offsets_needed:
368	remain_len = end_idx - byte_idx;
369	prev_st = pstr->cur_state;
370	if (BE (pstr->trans != NULL, 0))
371	{
372	int i, ch;
373
374	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
375	{
376	ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
377	buf[i] = pstr->trans[ch];
378	}
379	p = (const char *) buf;
380	}
381	else
382	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
383	mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
384	if (BE (mbclen < (size_t) -2, 1))
385	{
386	wchar_t wcu = __towupper (wc);
387	if (wcu != wc)
388	{
389	size_t mbcdlen;
390
391	mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
392	if (BE (mbclen == mbcdlen, 1))
393	memcpy (pstr->mbs + byte_idx, buf, mbclen);
394	else if (mbcdlen != (size_t) -1)
395	{
396	size_t i;
397
398	if (byte_idx + mbcdlen > pstr->bufs_len)
399	{
400	pstr->cur_state = prev_st;
401	break;
402	}
403
404	if (pstr->offsets == NULL)
405	{
406	pstr->offsets = re_malloc (Idx, pstr->bufs_len);
407
408	if (pstr->offsets == NULL)
409	return REG_ESPACE;
410	}
411	if (!pstr->offsets_needed)
412	{
413	for (i = 0; i < (size_t) byte_idx; ++i)
414	pstr->offsets[i] = i;
415	pstr->offsets_needed = 1;
416	}
417
418	memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
419	pstr->wcs[byte_idx] = wcu;
420	pstr->offsets[byte_idx] = src_idx;
421	for (i = 1; i < mbcdlen; ++i)
422	{
423	pstr->offsets[byte_idx + i]
424	= src_idx + (i < mbclen ? i : mbclen - 1);
425	pstr->wcs[byte_idx + i] = WEOF;
426	}
427	pstr->len += mbcdlen - mbclen;
428	if (pstr->raw_stop > src_idx)
429	pstr->stop += mbcdlen - mbclen;
430	end_idx = (pstr->bufs_len > pstr->len)
431	? pstr->len : pstr->bufs_len;
432	byte_idx += mbcdlen;
433	src_idx += mbclen;
434	continue;
435	}
436	else
437	memcpy (pstr->mbs + byte_idx, p, mbclen);
438	}
439	else
440	memcpy (pstr->mbs + byte_idx, p, mbclen);
441
442	if (BE (pstr->offsets_needed != 0, 0))
443	{
444	size_t i;
445	for (i = 0; i < mbclen; ++i)
446	pstr->offsets[byte_idx + i] = src_idx + i;
447	}
448	src_idx += mbclen;
449
450	pstr->wcs[byte_idx++] = wcu;
451	/* Write paddings. */
452	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
453	pstr->wcs[byte_idx++] = WEOF;
454	}
455	else if (mbclen == (size_t) -1 \|\| mbclen == 0
456	\|\| (mbclen == (size_t) -2 && pstr->bufs_len >= pstr->len))
457	{
458	/* It is an invalid character or '\0'. Just use the byte. */
459	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
460
461	if (BE (pstr->trans != NULL, 0))
462	ch = pstr->trans [ch];
463	pstr->mbs[byte_idx] = ch;
464
465	if (BE (pstr->offsets_needed != 0, 0))
466	pstr->offsets[byte_idx] = src_idx;
467	++src_idx;
468
469	/* And also cast it to wide char. */
470	pstr->wcs[byte_idx++] = (wchar_t) ch;
471	if (BE (mbclen == (size_t) -1, 0))
472	pstr->cur_state = prev_st;
473	}
474	else
475	{
476	/* The buffer doesn't have enough space, finish to build. */
477	pstr->cur_state = prev_st;
478	break;
479	}
480	}
481	pstr->valid_len = byte_idx;
482	pstr->valid_raw_len = src_idx;
483	return REG_NOERROR;
484	}
485
486	/* Skip characters until the index becomes greater than NEW_RAW_IDX.
487	Return the index. */
488
489	static Idx
490	internal_function
491	re_string_skip_chars (re_string_t pstr, Idx new_raw_idx, wint_t last_wc)
492	{
493	mbstate_t prev_st;
494	Idx rawbuf_idx;
495	size_t mbclen;
496	wint_t wc = WEOF;
497
498	/* Skip the characters which are not necessary to check. */
499	for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
500	rawbuf_idx < new_raw_idx;)
501	{
502	wchar_t wc2;
503	Idx remain_len = pstr->raw_len - rawbuf_idx;
504	prev_st = pstr->cur_state;
505	mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
506	remain_len, &pstr->cur_state);
507	if (BE (mbclen == (size_t) -2 \|\| mbclen == (size_t) -1 \|\| mbclen == 0, 0))
508	{
509	/* We treat these cases as a single byte character. */
510	if (mbclen == 0 \|\| remain_len == 0)
511	wc = L'\0';
512	else
513	wc = (unsigned char ) (pstr->raw_mbs + rawbuf_idx);
514	mbclen = 1;
515	pstr->cur_state = prev_st;
516	}
517	else
518	wc = wc2;
519	/* Then proceed the next character. */
520	rawbuf_idx += mbclen;
521	}
522	*last_wc = wc;
523	return rawbuf_idx;
524	}
525	#endif /* RE_ENABLE_I18N */
526
527	/* Build the buffer PSTR->MBS, and apply the translation if we need.
528	This function is used in case of REG_ICASE. */
529
530	static void
531	internal_function
532	build_upper_buffer (re_string_t *pstr)
533	{
534	Idx char_idx, end_idx;
535	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
536
537	for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
538	{
539	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
540	if (BE (pstr->trans != NULL, 0))
541	ch = pstr->trans[ch];
542	pstr->mbs[char_idx] = toupper (ch);
543	}
544	pstr->valid_len = char_idx;
545	pstr->valid_raw_len = char_idx;
546	}
547
548	/* Apply TRANS to the buffer in PSTR. */
549
550	static void
551	internal_function
552	re_string_translate_buffer (re_string_t *pstr)
553	{
554	Idx buf_idx, end_idx;
555	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
556
557	for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
558	{
559	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
560	pstr->mbs[buf_idx] = pstr->trans[ch];
561	}
562
563	pstr->valid_len = buf_idx;
564	pstr->valid_raw_len = buf_idx;
565	}
566
567	/* This function re-construct the buffers.
568	Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
569	convert to upper case in case of REG_ICASE, apply translation. */
570
571	static reg_errcode_t
572	internal_function __attribute_warn_unused_result__
573	re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
574	{
575	Idx offset;
576
577	if (BE (pstr->raw_mbs_idx <= idx, 0))
578	offset = idx - pstr->raw_mbs_idx;
579	else
580	{
581	/* Reset buffer. */
582	#ifdef RE_ENABLE_I18N
583	if (pstr->mb_cur_max > 1)
584	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
585	#endif /* RE_ENABLE_I18N */
586	pstr->len = pstr->raw_len;
587	pstr->stop = pstr->raw_stop;
588	pstr->valid_len = 0;
589	pstr->raw_mbs_idx = 0;
590	pstr->valid_raw_len = 0;
591	pstr->offsets_needed = 0;
592	pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
593	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF);
594	if (!pstr->mbs_allocated)
595	pstr->mbs = (unsigned char *) pstr->raw_mbs;
596	offset = idx;
597	}
598
599	if (BE (offset != 0, 1))
600	{
601	/* Should the already checked characters be kept? */
602	if (BE (offset < pstr->valid_raw_len, 1))
603	{
604	/* Yes, move them to the front of the buffer. */
605	#ifdef RE_ENABLE_I18N
606	if (BE (pstr->offsets_needed, 0))
607	{
608	Idx low = 0, high = pstr->valid_len, mid;
609	do
610	{
611	mid = (high + low) / 2;
612	if (pstr->offsets[mid] > offset)
613	high = mid;
614	else if (pstr->offsets[mid] < offset)
615	low = mid + 1;
616	else
617	break;
618	}
619	while (low < high);
620	if (pstr->offsets[mid] < offset)
621	++mid;
622	pstr->tip_context = re_string_context_at (pstr, mid - 1,
623	eflags);
624	/* This can be quite complicated, so handle specially
625	only the common and easy case where the character with
626	different length representation of lower and upper
627	case is present at or after offset. */
628	if (pstr->valid_len > offset
629	&& mid == offset && pstr->offsets[mid] == offset)
630	{
631	memmove (pstr->wcs, pstr->wcs + offset,
632	(pstr->valid_len - offset) * sizeof (wint_t));
633	memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
634	pstr->valid_len -= offset;
635	pstr->valid_raw_len -= offset;
636	for (low = 0; low < pstr->valid_len; low++)
637	pstr->offsets[low] = pstr->offsets[low + offset] - offset;
638	}
639	else
640	{
641	/* Otherwise, just find out how long the partial multibyte
642	character at offset is and fill it with WEOF/255. */
643	pstr->len = pstr->raw_len - idx + offset;
644	pstr->stop = pstr->raw_stop - idx + offset;
645	pstr->offsets_needed = 0;
646	while (mid > 0 && pstr->offsets[mid - 1] == offset)
647	--mid;
648	while (mid < pstr->valid_len)
649	if (pstr->wcs[mid] != WEOF)
650	break;
651	else
652	++mid;
653	if (mid == pstr->valid_len)
654	pstr->valid_len = 0;
655	else
656	{
657	pstr->valid_len = pstr->offsets[mid] - offset;
658	if (pstr->valid_len)
659	{
660	for (low = 0; low < pstr->valid_len; ++low)
661	pstr->wcs[low] = WEOF;
662	memset (pstr->mbs, 255, pstr->valid_len);
663	}
664	}
665	pstr->valid_raw_len = pstr->valid_len;
666	}
667	}
668	else
669	#endif
670	{
671	pstr->tip_context = re_string_context_at (pstr, offset - 1,
672	eflags);
673	#ifdef RE_ENABLE_I18N
674	if (pstr->mb_cur_max > 1)
675	memmove (pstr->wcs, pstr->wcs + offset,
676	(pstr->valid_len - offset) * sizeof (wint_t));
677	#endif /* RE_ENABLE_I18N */
678	if (BE (pstr->mbs_allocated, 0))
679	memmove (pstr->mbs, pstr->mbs + offset,
680	pstr->valid_len - offset);
681	pstr->valid_len -= offset;
682	pstr->valid_raw_len -= offset;
683	#if defined DEBUG && DEBUG
684	assert (pstr->valid_len > 0);
685	#endif
686	}
687	}
688	else
689	{
690	#ifdef RE_ENABLE_I18N
691	/* No, skip all characters until IDX. */
692	Idx prev_valid_len = pstr->valid_len;
693
694	if (BE (pstr->offsets_needed, 0))
695	{
696	pstr->len = pstr->raw_len - idx + offset;
697	pstr->stop = pstr->raw_stop - idx + offset;
698	pstr->offsets_needed = 0;
699	}
700	#endif
701	pstr->valid_len = 0;
702	#ifdef RE_ENABLE_I18N
703	if (pstr->mb_cur_max > 1)
704	{
705	Idx wcs_idx;
706	wint_t wc = WEOF;
707
708	if (pstr->is_utf8)
709	{
710	const unsigned char raw, p, *end;
711
712	/* Special case UTF-8. Multi-byte chars start with any
713	byte other than 0x80 - 0xbf. */
714	raw = pstr->raw_mbs + pstr->raw_mbs_idx;
715	end = raw + (offset - pstr->mb_cur_max);
716	if (end < pstr->raw_mbs)
717	end = pstr->raw_mbs;
718	p = raw + offset - 1;
719	#ifdef _LIBC
720	/* We know the wchar_t encoding is UCS4, so for the simple
721	case, ASCII characters, skip the conversion step. */
722	if (isascii (*p) && BE (pstr->trans == NULL, 1))
723	{
724	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
725	/* pstr->valid_len = 0; */
726	wc = (wchar_t) *p;
727	}
728	else
729	#endif
730	for (; p >= end; --p)
731	if ((*p & 0xc0) != 0x80)
732	{
733	mbstate_t cur_state;
734	wchar_t wc2;
735	Idx mlen = raw + pstr->len - p;
736	unsigned char buf[6];
737	size_t mbclen;
738
739	const unsigned char *pp = p;
740	if (BE (pstr->trans != NULL, 0))
741	{
742	int i = mlen < 6 ? mlen : 6;
743	while (--i >= 0)
744	buf[i] = pstr->trans[p[i]];
745	pp = buf;
746	}
747	/* XXX Don't use mbrtowc, we know which conversion
748	to use (UTF-8 -> UCS4). */
749	memset (&cur_state, 0, sizeof (cur_state));
750	mbclen = __mbrtowc (&wc2, (const char *) pp, mlen,
751	&cur_state);
752	if (raw + offset - p <= mbclen
753	&& mbclen < (size_t) -2)
754	{
755	memset (&pstr->cur_state, '\0',
756	sizeof (mbstate_t));
757	pstr->valid_len = mbclen - (raw + offset - p);
758	wc = wc2;
759	}
760	break;
761	}
762	}
763
764	if (wc == WEOF)
765	pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
766	if (wc == WEOF)
767	pstr->tip_context
768	= re_string_context_at (pstr, prev_valid_len - 1, eflags);
769	else
770	pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
771	&& IS_WIDE_WORD_CHAR (wc))
772	? CONTEXT_WORD
773	: ((IS_WIDE_NEWLINE (wc)
774	&& pstr->newline_anchor)
775	? CONTEXT_NEWLINE : 0));
776	if (BE (pstr->valid_len, 0))
777	{
778	for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
779	pstr->wcs[wcs_idx] = WEOF;
780	if (pstr->mbs_allocated)
781	memset (pstr->mbs, 255, pstr->valid_len);
782	}
783	pstr->valid_raw_len = pstr->valid_len;
784	}
785	else
786	#endif /* RE_ENABLE_I18N */
787	{
788	int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
789	pstr->valid_raw_len = 0;
790	if (pstr->trans)
791	c = pstr->trans[c];
792	pstr->tip_context = (bitset_contain (pstr->word_char, c)
793	? CONTEXT_WORD
794	: ((IS_NEWLINE (c) && pstr->newline_anchor)
795	? CONTEXT_NEWLINE : 0));
796	}
797	}
798	if (!BE (pstr->mbs_allocated, 0))
799	pstr->mbs += offset;
800	}
801	pstr->raw_mbs_idx = idx;
802	pstr->len -= offset;
803	pstr->stop -= offset;
804
805	/* Then build the buffers. */
806	#ifdef RE_ENABLE_I18N
807	if (pstr->mb_cur_max > 1)
808	{
809	if (pstr->icase)
810	{
811	reg_errcode_t ret = build_wcs_upper_buffer (pstr);
812	if (BE (ret != REG_NOERROR, 0))
813	return ret;
814	}
815	else
816	build_wcs_buffer (pstr);
817	}
818	else
819	#endif /* RE_ENABLE_I18N */
820	if (BE (pstr->mbs_allocated, 0))
821	{
822	if (pstr->icase)
823	build_upper_buffer (pstr);
824	else if (pstr->trans != NULL)
825	re_string_translate_buffer (pstr);
826	}
827	else
828	pstr->valid_len = pstr->len;
829
830	pstr->cur_idx = 0;
831	return REG_NOERROR;
832	}
833
834	static unsigned char
835	internal_function __attribute__ ((pure))
836	re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
837	{
838	int ch;
839	Idx off;
840
841	/* Handle the common (easiest) cases first. */
842	if (BE (!pstr->mbs_allocated, 1))
843	return re_string_peek_byte (pstr, idx);
844
845	#ifdef RE_ENABLE_I18N
846	if (pstr->mb_cur_max > 1
847	&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
848	return re_string_peek_byte (pstr, idx);
849	#endif
850
851	off = pstr->cur_idx + idx;
852	#ifdef RE_ENABLE_I18N
853	if (pstr->offsets_needed)
854	off = pstr->offsets[off];
855	#endif
856
857	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
858
859	#ifdef RE_ENABLE_I18N
860	/* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
861	this function returns CAPITAL LETTER I instead of first byte of
862	DOTLESS SMALL LETTER I. The latter would confuse the parser,
863	since peek_byte_case doesn't advance cur_idx in any way. */
864	if (pstr->offsets_needed && !isascii (ch))
865	return re_string_peek_byte (pstr, idx);
866	#endif
867
868	return ch;
869	}
870
871	static unsigned char
872	internal_function
873	re_string_fetch_byte_case (re_string_t *pstr)
874	{
875	if (BE (!pstr->mbs_allocated, 1))
876	return re_string_fetch_byte (pstr);
877
878	#ifdef RE_ENABLE_I18N
879	if (pstr->offsets_needed)
880	{
881	Idx off;
882	int ch;
883
884	/* For tr_TR.UTF-8 [[:islower:]] there is
885	[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
886	in that case the whole multi-byte character and return
887	the original letter. On the other side, with
888	[[: DOTLESS SMALL LETTER I return [[:I, as doing
889	anything else would complicate things too much. */
890
891	if (!re_string_first_byte (pstr, pstr->cur_idx))
892	return re_string_fetch_byte (pstr);
893
894	off = pstr->offsets[pstr->cur_idx];
895	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
896
897	if (! isascii (ch))
898	return re_string_fetch_byte (pstr);
899
900	re_string_skip_bytes (pstr,
901	re_string_char_size_at (pstr, pstr->cur_idx));
902	return ch;
903	}
904	#endif
905
906	return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
907	}
908
909	static void
910	internal_function
911	re_string_destruct (re_string_t *pstr)
912	{
913	#ifdef RE_ENABLE_I18N
914	re_free (pstr->wcs);
915	re_free (pstr->offsets);
916	#endif /* RE_ENABLE_I18N */
917	if (pstr->mbs_allocated)
918	re_free (pstr->mbs);
919	}
920
921	/* Return the context at IDX in INPUT. */
922
923	static unsigned int
924	internal_function
925	re_string_context_at (const re_string_t *input, Idx idx, int eflags)
926	{
927	int c;
928	if (BE (! REG_VALID_INDEX (idx), 0))
929	/* In this case, we use the value stored in input->tip_context,
930	since we can't know the character in input->mbs[-1] here. */
931	return input->tip_context;
932	if (BE (idx == input->len, 0))
933	return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
934	: CONTEXT_NEWLINE \| CONTEXT_ENDBUF);
935	#ifdef RE_ENABLE_I18N
936	if (input->mb_cur_max > 1)
937	{
938	wint_t wc;
939	Idx wc_idx = idx;
940	while(input->wcs[wc_idx] == WEOF)
941	{
942	#if defined DEBUG && DEBUG
943	/* It must not happen. */
944	assert (REG_VALID_INDEX (wc_idx));
945	#endif
946	--wc_idx;
947	if (! REG_VALID_INDEX (wc_idx))
948	return input->tip_context;
949	}
950	wc = input->wcs[wc_idx];
951	if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
952	return CONTEXT_WORD;
953	return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
954	? CONTEXT_NEWLINE : 0);
955	}
956	else
957	#endif
958	{
959	c = re_string_byte_at (input, idx);
960	if (bitset_contain (input->word_char, c))
961	return CONTEXT_WORD;
962	return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
963	}
964	}
965
966
967	/* Functions for set operation. */
968
969	static reg_errcode_t
970	internal_function __attribute_warn_unused_result__
971	re_node_set_alloc (re_node_set *set, Idx size)
972	{
973	set->alloc = size;
974	set->nelem = 0;
975	set->elems = re_malloc (Idx, size);
976	if (BE (set->elems == NULL, 0) && (MALLOC_0_IS_NONNULL \|\| size != 0))
977	return REG_ESPACE;
978	return REG_NOERROR;
979	}
980
981	static reg_errcode_t
982	internal_function __attribute_warn_unused_result__
983	re_node_set_init_1 (re_node_set *set, Idx elem)
984	{
985	set->alloc = 1;
986	set->nelem = 1;
987	set->elems = re_malloc (Idx, 1);
988	if (BE (set->elems == NULL, 0))
989	{
990	set->alloc = set->nelem = 0;
991	return REG_ESPACE;
992	}
993	set->elems[0] = elem;
994	return REG_NOERROR;
995	}
996
997	static reg_errcode_t
998	internal_function __attribute_warn_unused_result__
999	re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
1000	{
1001	set->alloc = 2;
1002	set->elems = re_malloc (Idx, 2);
1003	if (BE (set->elems == NULL, 0))
1004	return REG_ESPACE;
1005	if (elem1 == elem2)
1006	{
1007	set->nelem = 1;
1008	set->elems[0] = elem1;
1009	}
1010	else
1011	{
1012	set->nelem = 2;
1013	if (elem1 < elem2)
1014	{
1015	set->elems[0] = elem1;
1016	set->elems[1] = elem2;
1017	}
1018	else
1019	{
1020	set->elems[0] = elem2;
1021	set->elems[1] = elem1;
1022	}
1023	}
1024	return REG_NOERROR;
1025	}
1026
1027	static reg_errcode_t
1028	internal_function __attribute_warn_unused_result__
1029	re_node_set_init_copy (re_node_set dest, const re_node_set src)
1030	{
1031	dest->nelem = src->nelem;
1032	if (src->nelem > 0)
1033	{
1034	dest->alloc = dest->nelem;
1035	dest->elems = re_malloc (Idx, dest->alloc);
1036	if (BE (dest->elems == NULL, 0))
1037	{
1038	dest->alloc = dest->nelem = 0;
1039	return REG_ESPACE;
1040	}
1041	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1042	}
1043	else
1044	re_node_set_init_empty (dest);
1045	return REG_NOERROR;
1046	}
1047
1048	/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
1049	DEST. Return value indicate the error code or REG_NOERROR if succeeded.
1050	Note: We assume dest->elems is NULL, when dest->alloc is 0. */
1051
1052	static reg_errcode_t
1053	internal_function __attribute_warn_unused_result__
1054	re_node_set_add_intersect (re_node_set dest, const re_node_set src1,
1055	const re_node_set *src2)
1056	{
1057	Idx i1, i2, is, id, delta, sbase;
1058	if (src1->nelem == 0 \|\| src2->nelem == 0)
1059	return REG_NOERROR;
1060
1061	/* We need dest->nelem + 2 * elems_in_intersection; this is a
1062	conservative estimate. */
1063	if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
1064	{
1065	Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
1066	Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
1067	if (BE (new_elems == NULL, 0))
1068	return REG_ESPACE;
1069	dest->elems = new_elems;
1070	dest->alloc = new_alloc;
1071	}
1072
1073	/* Find the items in the intersection of SRC1 and SRC2, and copy
1074	into the top of DEST those that are not already in DEST itself. */
1075	sbase = dest->nelem + src1->nelem + src2->nelem;
1076	i1 = src1->nelem - 1;
1077	i2 = src2->nelem - 1;
1078	id = dest->nelem - 1;
1079	for (;;)
1080	{
1081	if (src1->elems[i1] == src2->elems[i2])
1082	{
1083	/* Try to find the item in DEST. Maybe we could binary search? */
1084	while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
1085	--id;
1086
1087	if (! REG_VALID_INDEX (id) \|\| dest->elems[id] != src1->elems[i1])
1088	dest->elems[--sbase] = src1->elems[i1];
1089
1090	if (! REG_VALID_INDEX (--i1) \|\| ! REG_VALID_INDEX (--i2))
1091	break;
1092	}
1093
1094	/* Lower the highest of the two items. */
1095	else if (src1->elems[i1] < src2->elems[i2])
1096	{
1097	if (! REG_VALID_INDEX (--i2))
1098	break;
1099	}
1100	else
1101	{
1102	if (! REG_VALID_INDEX (--i1))
1103	break;
1104	}
1105	}
1106
1107	id = dest->nelem - 1;
1108	is = dest->nelem + src1->nelem + src2->nelem - 1;
1109	delta = is - sbase + 1;
1110
1111	/* Now copy. When DELTA becomes zero, the remaining
1112	DEST elements are already in place; this is more or
1113	less the same loop that is in re_node_set_merge. */
1114	dest->nelem += delta;
1115	if (delta > 0 && REG_VALID_INDEX (id))
1116	for (;;)
1117	{
1118	if (dest->elems[is] > dest->elems[id])
1119	{
1120	/* Copy from the top. */
1121	dest->elems[id + delta--] = dest->elems[is--];
1122	if (delta == 0)
1123	break;
1124	}
1125	else
1126	{
1127	/* Slide from the bottom. */
1128	dest->elems[id + delta] = dest->elems[id];
1129	if (! REG_VALID_INDEX (--id))
1130	break;
1131	}
1132	}
1133
1134	/* Copy remaining SRC elements. */
1135	memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
1136
1137	return REG_NOERROR;
1138	}
1139
1140	/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1141	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1142
1143	static reg_errcode_t
1144	internal_function __attribute_warn_unused_result__
1145	re_node_set_init_union (re_node_set dest, const re_node_set src1,
1146	const re_node_set *src2)
1147	{
1148	Idx i1, i2, id;
1149	if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1150	{
1151	dest->alloc = src1->nelem + src2->nelem;
1152	dest->elems = re_malloc (Idx, dest->alloc);
1153	if (BE (dest->elems == NULL, 0))
1154	return REG_ESPACE;
1155	}
1156	else
1157	{
1158	if (src1 != NULL && src1->nelem > 0)
1159	return re_node_set_init_copy (dest, src1);
1160	else if (src2 != NULL && src2->nelem > 0)
1161	return re_node_set_init_copy (dest, src2);
1162	else
1163	re_node_set_init_empty (dest);
1164	return REG_NOERROR;
1165	}
1166	for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1167	{
1168	if (src1->elems[i1] > src2->elems[i2])
1169	{
1170	dest->elems[id++] = src2->elems[i2++];
1171	continue;
1172	}
1173	if (src1->elems[i1] == src2->elems[i2])
1174	++i2;
1175	dest->elems[id++] = src1->elems[i1++];
1176	}
1177	if (i1 < src1->nelem)
1178	{
1179	memcpy (dest->elems + id, src1->elems + i1,
1180	(src1->nelem - i1) * sizeof (Idx));
1181	id += src1->nelem - i1;
1182	}
1183	else if (i2 < src2->nelem)
1184	{
1185	memcpy (dest->elems + id, src2->elems + i2,
1186	(src2->nelem - i2) * sizeof (Idx));
1187	id += src2->nelem - i2;
1188	}
1189	dest->nelem = id;
1190	return REG_NOERROR;
1191	}
1192
1193	/* Calculate the union set of the sets DEST and SRC. And store it to
1194	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1195
1196	static reg_errcode_t
1197	internal_function __attribute_warn_unused_result__
1198	re_node_set_merge (re_node_set dest, const re_node_set src)
1199	{
1200	Idx is, id, sbase, delta;
1201	if (src == NULL \|\| src->nelem == 0)
1202	return REG_NOERROR;
1203	if (dest->alloc < 2 * src->nelem + dest->nelem)
1204	{
1205	Idx new_alloc = 2 * (src->nelem + dest->alloc);
1206	Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
1207	if (BE (new_buffer == NULL, 0))
1208	return REG_ESPACE;
1209	dest->elems = new_buffer;
1210	dest->alloc = new_alloc;
1211	}
1212
1213	if (BE (dest->nelem == 0, 0))
1214	{
1215	dest->nelem = src->nelem;
1216	memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
1217	return REG_NOERROR;
1218	}
1219
1220	/* Copy into the top of DEST the items of SRC that are not
1221	found in DEST. Maybe we could binary search in DEST? */
1222	for (sbase = dest->nelem + 2 * src->nelem,
1223	is = src->nelem - 1, id = dest->nelem - 1;
1224	REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
1225	{
1226	if (dest->elems[id] == src->elems[is])
1227	is--, id--;
1228	else if (dest->elems[id] < src->elems[is])
1229	dest->elems[--sbase] = src->elems[is--];
1230	else /* if (dest->elems[id] > src->elems[is]) */
1231	--id;
1232	}
1233
1234	if (REG_VALID_INDEX (is))
1235	{
1236	/* If DEST is exhausted, the remaining items of SRC must be unique. */
1237	sbase -= is + 1;
1238	memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
1239	}
1240
1241	id = dest->nelem - 1;
1242	is = dest->nelem + 2 * src->nelem - 1;
1243	delta = is - sbase + 1;
1244	if (delta == 0)
1245	return REG_NOERROR;
1246
1247	/* Now copy. When DELTA becomes zero, the remaining
1248	DEST elements are already in place. */
1249	dest->nelem += delta;
1250	for (;;)
1251	{
1252	if (dest->elems[is] > dest->elems[id])
1253	{
1254	/* Copy from the top. */
1255	dest->elems[id + delta--] = dest->elems[is--];
1256	if (delta == 0)
1257	break;
1258	}
1259	else
1260	{
1261	/* Slide from the bottom. */
1262	dest->elems[id + delta] = dest->elems[id];
1263	if (! REG_VALID_INDEX (--id))
1264	{
1265	/* Copy remaining SRC elements. */
1266	memcpy (dest->elems, dest->elems + sbase,
1267	delta * sizeof (Idx));
1268	break;
1269	}
1270	}
1271	}
1272
1273	return REG_NOERROR;
1274	}
1275
1276	/* Insert the new element ELEM to the re_node_set* SET.
1277	SET should not already have ELEM.
1278	Return true if successful. */
1279
1280	static bool
1281	internal_function __attribute_warn_unused_result__
1282	re_node_set_insert (re_node_set *set, Idx elem)
1283	{
1284	Idx idx;
1285	/* In case the set is empty. */
1286	if (set->alloc == 0)
1287	return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
1288
1289	if (BE (set->nelem, 0) == 0)
1290	{
1291	/* We already guaranteed above that set->alloc != 0. */
1292	set->elems[0] = elem;
1293	++set->nelem;
1294	return true;
1295	}
1296
1297	/* Realloc if we need. */
1298	if (set->alloc == set->nelem)
1299	{
1300	Idx *new_elems;
1301	set->alloc = set->alloc * 2;
1302	new_elems = re_realloc (set->elems, Idx, set->alloc);
1303	if (BE (new_elems == NULL, 0))
1304	return false;
1305	set->elems = new_elems;
1306	}
1307
1308	/* Move the elements which follows the new element. Test the
1309	first element separately to skip a check in the inner loop. */
1310	if (elem < set->elems[0])
1311	{
1312	idx = 0;
1313	for (idx = set->nelem; idx > 0; idx--)
1314	set->elems[idx] = set->elems[idx - 1];
1315	}
1316	else
1317	{
1318	for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1319	set->elems[idx] = set->elems[idx - 1];
1320	}
1321
1322	/* Insert the new element. */
1323	set->elems[idx] = elem;
1324	++set->nelem;
1325	return true;
1326	}
1327
1328	/* Insert the new element ELEM to the re_node_set* SET.
1329	SET should not already have any element greater than or equal to ELEM.
1330	Return true if successful. */
1331
1332	static bool
1333	internal_function __attribute_warn_unused_result__
1334	re_node_set_insert_last (re_node_set *set, Idx elem)
1335	{
1336	/* Realloc if we need. */
1337	if (set->alloc == set->nelem)
1338	{
1339	Idx *new_elems;
1340	set->alloc = (set->alloc + 1) * 2;
1341	new_elems = re_realloc (set->elems, Idx, set->alloc);
1342	if (BE (new_elems == NULL, 0))
1343	return false;
1344	set->elems = new_elems;
1345	}
1346
1347	/* Insert the new element. */
1348	set->elems[set->nelem++] = elem;
1349	return true;
1350	}
1351
1352	/* Compare two node sets SET1 and SET2.
1353	Return true if SET1 and SET2 are equivalent. */
1354
1355	static bool
1356	internal_function __attribute__ ((pure))
1357	re_node_set_compare (const re_node_set set1, const re_node_set set2)
1358	{
1359	Idx i;
1360	if (set1 == NULL \|\| set2 == NULL \|\| set1->nelem != set2->nelem)
1361	return false;
1362	for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
1363	if (set1->elems[i] != set2->elems[i])
1364	return false;
1365	return true;
1366	}
1367
1368	/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1369
1370	static Idx
1371	internal_function __attribute__ ((pure))
1372	re_node_set_contains (const re_node_set *set, Idx elem)
1373	{
1374	__re_size_t idx, right, mid;
1375	if (! REG_VALID_NONZERO_INDEX (set->nelem))
1376	return 0;
1377
1378	/* Binary search the element. */
1379	idx = 0;
1380	right = set->nelem - 1;
1381	while (idx < right)
1382	{
1383	mid = (idx + right) / 2;
1384	if (set->elems[mid] < elem)
1385	idx = mid + 1;
1386	else
1387	right = mid;
1388	}
1389	return set->elems[idx] == elem ? idx + 1 : 0;
1390	}
1391
1392	static void
1393	internal_function
1394	re_node_set_remove_at (re_node_set *set, Idx idx)
1395	{
1396	verify (! TYPE_SIGNED (Idx));
1397	/* if (idx < 0)
1398	return; */
1399	if (idx >= set->nelem)
1400	return;
1401	--set->nelem;
1402	for (; idx < set->nelem; idx++)
1403	set->elems[idx] = set->elems[idx + 1];
1404	}
1405
1406
1407
1408	/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1409	Or return REG_MISSING if an error occurred. */
1410
1411	static Idx
1412	internal_function
1413	re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1414	{
1415	if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1416	{
1417	size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1418	Idx new_nexts, new_indices;
1419	re_node_set new_edests, new_eclosures;
1420	re_token_t *new_nodes;
1421
1422	/* Avoid overflows in realloc. */
1423	const size_t max_object_size = MAX (sizeof (re_token_t),
1424	MAX (sizeof (re_node_set),
1425	sizeof (Idx)));
1426	if (BE (MIN (IDX_MAX, SIZE_MAX / max_object_size) < new_nodes_alloc, 0))
1427	return REG_MISSING;
1428
1429	new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1430	if (BE (new_nodes == NULL, 0))
1431	return REG_MISSING;
1432	dfa->nodes = new_nodes;
1433	new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
1434	new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
1435	new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1436	new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1437	if (BE (new_nexts == NULL \|\| new_indices == NULL
1438	\|\| new_edests == NULL \|\| new_eclosures == NULL, 0))
1439	return REG_MISSING;
1440	dfa->nexts = new_nexts;
1441	dfa->org_indices = new_indices;
1442	dfa->edests = new_edests;
1443	dfa->eclosures = new_eclosures;
1444	dfa->nodes_alloc = new_nodes_alloc;
1445	}
1446	dfa->nodes[dfa->nodes_len] = token;
1447	dfa->nodes[dfa->nodes_len].constraint = 0;
1448	#ifdef RE_ENABLE_I18N
1449	dfa->nodes[dfa->nodes_len].accept_mb =
1450	((token.type == OP_PERIOD && dfa->mb_cur_max > 1)
1451	\|\| token.type == COMPLEX_BRACKET);
1452	#endif
1453	dfa->nexts[dfa->nodes_len] = REG_MISSING;
1454	re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1455	re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1456	return dfa->nodes_len++;
1457	}
1458
1459	static re_hashval_t
1460	internal_function
1461	calc_state_hash (const re_node_set *nodes, unsigned int context)
1462	{
1463	re_hashval_t hash = nodes->nelem + context;
1464	Idx i;
1465	for (i = 0 ; i < nodes->nelem ; i++)
1466	hash += nodes->elems[i];
1467	return hash;
1468	}
1469
1470	/* Search for the state whose node_set is equivalent to NODES.
1471	Return the pointer to the state, if we found it in the DFA.
1472	Otherwise create the new one and return it. In case of an error
1473	return NULL and set the error code in ERR.
1474	Note: - We assume NULL as the invalid state, then it is possible that
1475	return value is NULL and ERR is REG_NOERROR.
1476	- We never return non-NULL value in case of any errors, it is for
1477	optimization. */
1478
1479	static re_dfastate_t *
1480	internal_function __attribute_warn_unused_result__
1481	re_acquire_state (reg_errcode_t err, const re_dfa_t dfa,
1482	const re_node_set *nodes)
1483	{
1484	re_hashval_t hash;
1485	re_dfastate_t *new_state;
1486	struct re_state_table_entry *spot;
1487	Idx i;
1488	#ifdef lint
1489	/* Suppress bogus uninitialized-variable warnings. */
1490	*err = REG_NOERROR;
1491	#endif
1492	if (BE (nodes->nelem == 0, 0))
1493	{
1494	*err = REG_NOERROR;
1495	return NULL;
1496	}
1497	hash = calc_state_hash (nodes, 0);
1498	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1499
1500	for (i = 0 ; i < spot->num ; i++)
1501	{
1502	re_dfastate_t *state = spot->array[i];
1503	if (hash != state->hash)
1504	continue;
1505	if (re_node_set_compare (&state->nodes, nodes))
1506	return state;
1507	}
1508
1509	/* There are no appropriate state in the dfa, create the new one. */
1510	new_state = create_ci_newstate (dfa, nodes, hash);
1511	if (BE (new_state == NULL, 0))
1512	*err = REG_ESPACE;
1513
1514	return new_state;
1515	}
1516
1517	/* Search for the state whose node_set is equivalent to NODES and
1518	whose context is equivalent to CONTEXT.
1519	Return the pointer to the state, if we found it in the DFA.
1520	Otherwise create the new one and return it. In case of an error
1521	return NULL and set the error code in ERR.
1522	Note: - We assume NULL as the invalid state, then it is possible that
1523	return value is NULL and ERR is REG_NOERROR.
1524	- We never return non-NULL value in case of any errors, it is for
1525	optimization. */
1526
1527	static re_dfastate_t *
1528	internal_function __attribute_warn_unused_result__
1529	re_acquire_state_context (reg_errcode_t err, const re_dfa_t dfa,
1530	const re_node_set *nodes, unsigned int context)
1531	{
1532	re_hashval_t hash;
1533	re_dfastate_t *new_state;
1534	struct re_state_table_entry *spot;
1535	Idx i;
1536	#ifdef lint
1537	/* Suppress bogus uninitialized-variable warnings. */
1538	*err = REG_NOERROR;
1539	#endif
1540	if (nodes->nelem == 0)
1541	{
1542	*err = REG_NOERROR;
1543	return NULL;
1544	}
1545	hash = calc_state_hash (nodes, context);
1546	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1547
1548	for (i = 0 ; i < spot->num ; i++)
1549	{
1550	re_dfastate_t *state = spot->array[i];
1551	if (state->hash == hash
1552	&& state->context == context
1553	&& re_node_set_compare (state->entrance_nodes, nodes))
1554	return state;
1555	}
1556	/* There are no appropriate state in 'dfa', create the new one. */
1557	new_state = create_cd_newstate (dfa, nodes, context, hash);
1558	if (BE (new_state == NULL, 0))
1559	*err = REG_ESPACE;
1560
1561	return new_state;
1562	}
1563
1564	/* Finish initialization of the new state NEWSTATE, and using its hash value
1565	HASH put in the appropriate bucket of DFA's state table. Return value
1566	indicates the error code if failed. */
1567
1568	static reg_errcode_t
1569	__attribute_warn_unused_result__
1570	register_state (const re_dfa_t dfa, re_dfastate_t newstate,
1571	re_hashval_t hash)
1572	{
1573	struct re_state_table_entry *spot;
1574	reg_errcode_t err;
1575	Idx i;
1576
1577	newstate->hash = hash;
1578	err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1579	if (BE (err != REG_NOERROR, 0))
1580	return REG_ESPACE;
1581	for (i = 0; i < newstate->nodes.nelem; i++)
1582	{
1583	Idx elem = newstate->nodes.elems[i];
1584	if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1585	if (! re_node_set_insert_last (&newstate->non_eps_nodes, elem))
1586	return REG_ESPACE;
1587	}
1588
1589	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1590	if (BE (spot->alloc <= spot->num, 0))
1591	{
1592	Idx new_alloc = 2 * spot->num + 2;
1593	re_dfastate_t *new_array = re_realloc (spot->array, re_dfastate_t ,
1594	new_alloc);
1595	if (BE (new_array == NULL, 0))
1596	return REG_ESPACE;
1597	spot->array = new_array;
1598	spot->alloc = new_alloc;
1599	}
1600	spot->array[spot->num++] = newstate;
1601	return REG_NOERROR;
1602	}
1603
1604	static void
1605	free_state (re_dfastate_t *state)
1606	{
1607	re_node_set_free (&state->non_eps_nodes);
1608	re_node_set_free (&state->inveclosure);
1609	if (state->entrance_nodes != &state->nodes)
1610	{
1611	re_node_set_free (state->entrance_nodes);
1612	re_free (state->entrance_nodes);
1613	}
1614	re_node_set_free (&state->nodes);
1615	re_free (state->word_trtable);
1616	re_free (state->trtable);
1617	re_free (state);
1618	}
1619
1620	/* Create the new state which is independent of contexts.
1621	Return the new state if succeeded, otherwise return NULL. */
1622
1623	static re_dfastate_t *
1624	internal_function __attribute_warn_unused_result__
1625	create_ci_newstate (const re_dfa_t dfa, const re_node_set nodes,
1626	re_hashval_t hash)
1627	{
1628	Idx i;
1629	reg_errcode_t err;
1630	re_dfastate_t *newstate;
1631
1632	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1633	if (BE (newstate == NULL, 0))
1634	return NULL;
1635	err = re_node_set_init_copy (&newstate->nodes, nodes);
1636	if (BE (err != REG_NOERROR, 0))
1637	{
1638	re_free (newstate);
1639	return NULL;
1640	}
1641
1642	newstate->entrance_nodes = &newstate->nodes;
1643	for (i = 0 ; i < nodes->nelem ; i++)
1644	{
1645	re_token_t *node = dfa->nodes + nodes->elems[i];
1646	re_token_type_t type = node->type;
1647	if (type == CHARACTER && !node->constraint)
1648	continue;
1649	#ifdef RE_ENABLE_I18N
1650	newstate->accept_mb \|= node->accept_mb;
1651	#endif /* RE_ENABLE_I18N */
1652
1653	/* If the state has the halt node, the state is a halt state. */
1654	if (type == END_OF_RE)
1655	newstate->halt = 1;
1656	else if (type == OP_BACK_REF)
1657	newstate->has_backref = 1;
1658	else if (type == ANCHOR \|\| node->constraint)
1659	newstate->has_constraint = 1;
1660	}
1661	err = register_state (dfa, newstate, hash);
1662	if (BE (err != REG_NOERROR, 0))
1663	{
1664	free_state (newstate);
1665	newstate = NULL;
1666	}
1667	return newstate;
1668	}
1669
1670	/* Create the new state which is depend on the context CONTEXT.
1671	Return the new state if succeeded, otherwise return NULL. */
1672
1673	static re_dfastate_t *
1674	internal_function __attribute_warn_unused_result__
1675	create_cd_newstate (const re_dfa_t dfa, const re_node_set nodes,
1676	unsigned int context, re_hashval_t hash)
1677	{
1678	Idx i, nctx_nodes = 0;
1679	reg_errcode_t err;
1680	re_dfastate_t *newstate;
1681
1682	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1683	if (BE (newstate == NULL, 0))
1684	return NULL;
1685	err = re_node_set_init_copy (&newstate->nodes, nodes);
1686	if (BE (err != REG_NOERROR, 0))
1687	{
1688	re_free (newstate);
1689	return NULL;
1690	}
1691
1692	newstate->context = context;
1693	newstate->entrance_nodes = &newstate->nodes;
1694
1695	for (i = 0 ; i < nodes->nelem ; i++)
1696	{
1697	re_token_t *node = dfa->nodes + nodes->elems[i];
1698	re_token_type_t type = node->type;
1699	unsigned int constraint = node->constraint;
1700
1701	if (type == CHARACTER && !constraint)
1702	continue;
1703	#ifdef RE_ENABLE_I18N
1704	newstate->accept_mb \|= node->accept_mb;
1705	#endif /* RE_ENABLE_I18N */
1706
1707	/* If the state has the halt node, the state is a halt state. */
1708	if (type == END_OF_RE)
1709	newstate->halt = 1;
1710	else if (type == OP_BACK_REF)
1711	newstate->has_backref = 1;
1712
1713	if (constraint)
1714	{
1715	if (newstate->entrance_nodes == &newstate->nodes)
1716	{
1717	newstate->entrance_nodes = re_malloc (re_node_set, 1);
1718	if (BE (newstate->entrance_nodes == NULL, 0))
1719	{
1720	free_state (newstate);
1721	return NULL;
1722	}
1723	if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
1724	!= REG_NOERROR)
1725	return NULL;
1726	nctx_nodes = 0;
1727	newstate->has_constraint = 1;
1728	}
1729
1730	if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1731	{
1732	re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1733	++nctx_nodes;
1734	}
1735	}
1736	}
1737	err = register_state (dfa, newstate, hash);
1738	if (BE (err != REG_NOERROR, 0))
1739	{
1740	free_state (newstate);
1741	newstate = NULL;
1742	}
1743	return newstate;
1744	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: