Context Navigation

texindex.c@ 3043

Visit:

Last change on this file since 3043 was 2619, checked in by bird, 19 years ago
applied OS/2 patches (manually).
File size: 42.4 KB

Line
1	/* texindex -- sort TeX index dribble output into an actual index.
2	$Id: texindex.c,v 1.11 2004/04/11 17:56:47 karl Exp $
3
4	Copyright (C) 1987, 1991, 1992, 1996, 1997, 1998, 1999, 2000, 2001,
5	2002, 2003, 2004 Free Software Foundation, Inc.
6
7	This program is free software; you can redistribute it and/or modify
8	it under the terms of the GNU General Public License as published by
9	the Free Software Foundation; either version 2, or (at your option)
10	any later version.
11
12	This program is distributed in the hope that it will be useful,
13	but WITHOUT ANY WARRANTY; without even the implied warranty of
14	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	GNU General Public License for more details.
16
17	You should have received a copy of the GNU General Public License
18	along with this program; if not, write to the Free Software
19	Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307. */
20
21	#include "system.h"
22	#include <getopt.h>
23
24	#ifdef __EMX__
25	# include <stdlib.h>
26	#endif
27
28	static char *program_name = "texindex";
29
30	#if defined (emacs)
31	# include "../src/config.h"
32	/* Some s/os.h files redefine these. */
33	# undef read
34	# undef close
35	# undef write
36	# undef open
37	#endif
38
39	#if !defined (HAVE_MEMSET)
40	#undef memset
41	#define memset(ptr, ignore, count) bzero (ptr, count)
42	#endif
43
44	char mktemp (char );
45
46	#if !defined (SEEK_SET)
47	# define SEEK_SET 0
48	# define SEEK_CUR 1
49	# define SEEK_END 2
50	#endif /* !SEEK_SET */
51
52	struct linebuffer;
53
54	/* When sorting in core, this structure describes one line
55	and the position and length of its first keyfield. */
56	struct lineinfo
57	{
58	char text; / The actual text of the line. */
59	union {
60	char text; / The start of the key (for textual comparison). */
61	long number; /* The numeric value (for numeric comparison). */
62	} key;
63	long keylen; /* Length of KEY field. */
64	};
65
66	/* This structure describes a field to use as a sort key. */
67	struct keyfield
68	{
69	int startwords; /* Number of words to skip. */
70	int startchars; /* Number of additional chars to skip. */
71	int endwords; /* Number of words to ignore at end. */
72	int endchars; /* Ditto for characters of last word. */
73	char ignore_blanks; /* Non-zero means ignore spaces and tabs. */
74	char fold_case; /* Non-zero means case doesn't matter. */
75	char reverse; /* Non-zero means compare in reverse order. */
76	char numeric; /* Non-zeros means field is ASCII numeric. */
77	char positional; /* Sort according to file position. */
78	char braced; /* Count balanced-braced groupings as fields. */
79	};
80
81	/* Vector of keyfields to use. */
82	struct keyfield keyfields[3];
83
84	/* Number of keyfields stored in that vector. */
85	int num_keyfields = 3;
86
87	/* Vector of input file names, terminated with a null pointer. */
88	char **infiles;
89
90	/* Vector of corresponding output file names, or NULL, meaning default it
91	(add an `s' to the end). */
92	char **outfiles;
93
94	/* Length of `infiles'. */
95	int num_infiles;
96
97	/* Pointer to the array of pointers to lines being sorted. */
98	char **linearray;
99
100	/* The allocated length of `linearray'. */
101	long nlines;
102
103	/* Directory to use for temporary files. On Unix, it ends with a slash. */
104	char *tempdir;
105
106	/* Number of last temporary file. */
107	int tempcount;
108
109	/* Number of last temporary file already deleted.
110	Temporary files are deleted by `flush_tempfiles' in order of creation. */
111	int last_deleted_tempcount;
112
113	/* During in-core sort, this points to the base of the data block
114	which contains all the lines of data. */
115	char *text_base;
116
117	/* Initially 0; changed to 1 if we want initials in this index. */
118	int need_initials;
119
120	/* Remembers the first initial letter seen in this index, so we can
121	determine whether we need initials in the sorted form. */
122	char first_initial;
123
124	/* Additional command switches .*/
125
126	/* Nonzero means do not delete tempfiles -- for debugging. */
127	int keep_tempfiles;
128
129	/* Forward declarations of functions in this file. */
130	void decode_command (int argc, char **argv);
131	void sort_in_core (char infile, int total, char outfile);
132	void sort_offline (char infile, off_t total, char outfile);
133	char *parsefile (char filename, char *nextline, char data, long int size);
134	char find_field (struct keyfield keyfield, char str, long int lengthptr);
135	char find_pos (char str, int words, int chars, int ignore_blanks);
136	long find_value (char *start, long int length);
137	char find_braced_pos (char str, int words, int chars, int ignore_blanks);
138	char find_braced_end (char str);
139	void writelines (char *linearray, int nlines, FILE ostream);
140	int compare_field (struct keyfield keyfield, char start1,
141	long int length1, long int pos1, char *start2,
142	long int length2, long int pos2);
143	int compare_full (const void , const void );
144	long readline (struct linebuffer linebuffer, FILE stream);
145	int merge_files (char *infiles, int nfiles, char outfile);
146	int merge_direct (char *infiles, int nfiles, char outfile);
147	void pfatal_with_name (const char *name);
148	void fatal (const char format, const char arg);
149	void error (const char format, const char arg);
150	void xmalloc (), xrealloc ();
151	char concat (char s1, char *s2);
152	void flush_tempfiles (int to_count);
153
154
155	#define MAX_IN_CORE_SORT 500000
156
157	int
158	main (int argc, char **argv)
159	{
160	int i;
161
162	tempcount = 0;
163	last_deleted_tempcount = 0;
164
165	#ifdef __EMX__
166	_response(&argc, &argv);
167	_wildcard(&argc, &argv);
168	#endif
169
170	#ifdef HAVE_SETLOCALE
171	/* Set locale via LC_ALL. */
172	setlocale (LC_ALL, "");
173	#endif
174
175	/* Set the text message domain. */
176	bindtextdomain (PACKAGE, LOCALEDIR);
177	textdomain (PACKAGE);
178
179	/* In case we write to a redirected stdout that fails. */
180	/* not ready atexit (close_stdout); */
181
182	/* Describe the kind of sorting to do. */
183	/* The first keyfield uses the first braced field and folds case. */
184	keyfields[0].braced = 1;
185	keyfields[0].fold_case = 1;
186	keyfields[0].endwords = -1;
187	keyfields[0].endchars = -1;
188
189	/* The second keyfield uses the second braced field, numerically. */
190	keyfields[1].braced = 1;
191	keyfields[1].numeric = 1;
192	keyfields[1].startwords = 1;
193	keyfields[1].endwords = -1;
194	keyfields[1].endchars = -1;
195
196	/* The third keyfield (which is ignored while discarding duplicates)
197	compares the whole line. */
198	keyfields[2].endwords = -1;
199	keyfields[2].endchars = -1;
200
201	decode_command (argc, argv);
202
203	/* Process input files completely, one by one. */
204
205	for (i = 0; i < num_infiles; i++)
206	{
207	int desc;
208	off_t ptr;
209	char *outfile;
210	struct stat instat;
211
212	desc = open (infiles[i], O_RDONLY, 0);
213	if (desc < 0)
214	pfatal_with_name (infiles[i]);
215
216	if (stat (infiles[i], &instat))
217	pfatal_with_name (infiles[i]);
218	if (S_ISDIR (instat.st_mode))
219	{
220	#ifdef EISDIR
221	errno = EISDIR;
222	#endif
223	pfatal_with_name (infiles[i]);
224	}
225
226	lseek (desc, (off_t) 0, SEEK_END);
227	ptr = (off_t) lseek (desc, (off_t) 0, SEEK_CUR);
228
229	close (desc);
230
231	outfile = outfiles[i];
232	if (!outfile)
233	outfile = concat (infiles[i], "s");
234
235	need_initials = 0;
236	first_initial = '\0';
237
238	if (ptr < MAX_IN_CORE_SORT)
239	/* Sort a small amount of data. */
240	sort_in_core (infiles[i], (int)ptr, outfile);
241	else
242	sort_offline (infiles[i], ptr, outfile);
243	}
244
245	flush_tempfiles (tempcount);
246	xexit (0);
247	return 0; /* Avoid bogus warnings. */
248	}
249
250
251	typedef struct
252	{
253	char *long_name;
254	char *short_name;
255	int *variable_ref;
256	int variable_value;
257	char *arg_name;
258	char *doc_string;
259	} TEXINDEX_OPTION;
260
261	TEXINDEX_OPTION texindex_options[] = {
262	{ "--help", "-h", (int )NULL, 0, (char )NULL,
263	N_("display this help and exit") },
264	{ "--keep", "-k", &keep_tempfiles, 1, (char *)NULL,
265	N_("keep temporary files around after processing") },
266	{ "--no-keep", 0, &keep_tempfiles, 0, (char *)NULL,
267	N_("do not keep temporary files around after processing (default)") },
268	{ "--output", "-o", (int *)NULL, 0, "FILE",
269	N_("send output to FILE") },
270	{ "--version", (char )NULL, (int )NULL, 0, (char *)NULL,
271	N_("display version information and exit") },
272	{ (char )NULL, (char )NULL, (int )NULL, 0, (char )NULL }
273	};
274
275	void
276	usage (int result_value)
277	{
278	register int i;
279	FILE *f = result_value ? stderr : stdout;
280
281	fprintf (f, _("Usage: %s [OPTION]... FILE...\n"), program_name);
282	fprintf (f, _("Generate a sorted index for each TeX output FILE.\n"));
283	/* Avoid trigraph nonsense. */
284	fprintf (f,
285	_("Usually FILE... is specified as `foo.%c%c\' for a document `foo.texi'.\n"),
286	'?', '?'); /* avoid trigraph in cat-id-tbl.c */
287	fprintf (f, _("\nOptions:\n"));
288
289	for (i = 0; texindex_options[i].long_name; i++)
290	{
291	putc (' ', f);
292
293	if (texindex_options[i].short_name)
294	fprintf (f, "%s, ", texindex_options[i].short_name);
295
296	fprintf (f, "%s %s",
297	texindex_options[i].long_name,
298	texindex_options[i].arg_name
299	? texindex_options[i].arg_name : "");
300
301	fprintf (f, "\t%s\n", _(texindex_options[i].doc_string));
302	}
303	fputs (_("\n\
304	Email bug reports to bug-texinfo@gnu.org,\n\
305	general questions and discussion to help-texinfo@gnu.org.\n\
306	Texinfo home page: http://www.gnu.org/software/texinfo/"), f);
307	fputs ("\n", f);
308
309	xexit (result_value);
310	}
311
312	/* Decode the command line arguments to set the parameter variables
313	and set up the vector of keyfields and the vector of input files. */
314
315	void
316	decode_command (int argc, char **argv)
317	{
318	int arg_index = 1;
319	char **ip;
320	char **op;
321
322	/* Store default values into parameter variables. */
323
324	tempdir = getenv ("TMPDIR");
325	if (tempdir == NULL)
326	tempdir = getenv ("TEMP");
327	if (tempdir == NULL)
328	tempdir = getenv ("TMP");
329	if (tempdir == NULL)
330	tempdir = DEFAULT_TMPDIR;
331	else
332	tempdir = concat (tempdir, "/");
333
334	keep_tempfiles = 0;
335
336	/* Allocate ARGC input files, which must be enough. */
337
338	infiles = (char *) xmalloc (argc sizeof (char *));
339	outfiles = (char *) xmalloc (argc sizeof (char *));
340	ip = infiles;
341	op = outfiles;
342
343	while (arg_index < argc)
344	{
345	char *arg = argv[arg_index++];
346
347	if (*arg == '-')
348	{
349	if (strcmp (arg, "--version") == 0)
350	{
351	printf ("texindex (GNU %s) %s\n", PACKAGE, VERSION);
352	puts ("");
353	puts ("Copyright (C) 2004 Free Software Foundation, Inc.");
354	printf (_("There is NO warranty. You may redistribute this software\n\
355	under the terms of the GNU General Public License.\n\
356	For more information about these matters, see the files named COPYING.\n"));
357	xexit (0);
358	}
359	else if ((strcmp (arg, "--keep") == 0) \|\|
360	(strcmp (arg, "-k") == 0))
361	{
362	keep_tempfiles = 1;
363	}
364	else if ((strcmp (arg, "--help") == 0) \|\|
365	(strcmp (arg, "-h") == 0))
366	{
367	usage (0);
368	}
369	else if ((strcmp (arg, "--output") == 0) \|\|
370	(strcmp (arg, "-o") == 0))
371	{
372	if (argv[arg_index] != (char *)NULL)
373	{
374	arg_index++;
375	if (op > outfiles)
376	*(op - 1) = argv[arg_index];
377	}
378	else
379	usage (1);
380	}
381	else
382	usage (1);
383	}
384	else
385	{
386	*ip++ = arg;
387	op++ = (char )NULL;
388	}
389	}
390
391	/* Record number of keyfields and terminate list of filenames. */
392	num_infiles = ip - infiles;
393	ip = (char )NULL;
394	if (num_infiles == 0)
395	usage (1);
396	}
397
398
399	/* Return a name for temporary file COUNT. */
400
401	static char *
402	maketempname (int count)
403	{
404	static char *tempbase = NULL;
405	char tempsuffix[10];
406
407	if (!tempbase)
408	{
409	int fd;
410	tempbase = concat (tempdir, "txidxXXXXXX");
411
412	fd = mkstemp (tempbase);
413	if (fd == -1)
414	pfatal_with_name (tempbase);
415	}
416
417	sprintf (tempsuffix, ".%d", count);
418	return concat (tempbase, tempsuffix);
419	}
420
421
422	/* Delete all temporary files up to TO_COUNT. */
423
424	void
425	flush_tempfiles (int to_count)
426	{
427	if (keep_tempfiles)
428	return;
429	while (last_deleted_tempcount < to_count)
430	unlink (maketempname (++last_deleted_tempcount));
431	}
432
433
434
435	/* Compare LINE1 and LINE2 according to the specified set of keyfields. */
436
437	int
438	compare_full (const void p1, const void p2)
439	{
440	char line1 = (char ) p1;
441	char line2 = (char ) p2;
442	int i;
443
444	/* Compare using the first keyfield;
445	if that does not distinguish the lines, try the second keyfield;
446	and so on. */
447
448	for (i = 0; i < num_keyfields; i++)
449	{
450	long length1, length2;
451	char start1 = find_field (&keyfields[i], line1, &length1);
452	char start2 = find_field (&keyfields[i], line2, &length2);
453	int tem = compare_field (&keyfields[i], start1, length1,
454	*line1 - text_base,
455	start2, length2, *line2 - text_base);
456	if (tem)
457	{
458	if (keyfields[i].reverse)
459	return -tem;
460	return tem;
461	}
462	}
463
464	return 0; /* Lines match exactly. */
465	}
466
467	/* Compare LINE1 and LINE2, described by structures
468	in which the first keyfield is identified in advance.
469	For positional sorting, assumes that the order of the lines in core
470	reflects their nominal order. */
471	int
472	compare_prepared (const void p1, const void p2)
473	{
474	struct lineinfo line1 = (struct lineinfo ) p1;
475	struct lineinfo line2 = (struct lineinfo ) p2;
476	int i;
477	int tem;
478	char text1, text2;
479
480	/* Compare using the first keyfield, which has been found for us already. */
481	if (keyfields->positional)
482	{
483	if (line1->text - text_base > line2->text - text_base)
484	tem = 1;
485	else
486	tem = -1;
487	}
488	else if (keyfields->numeric)
489	tem = line1->key.number - line2->key.number;
490	else
491	tem = compare_field (keyfields, line1->key.text, line1->keylen, 0,
492	line2->key.text, line2->keylen, 0);
493	if (tem)
494	{
495	if (keyfields->reverse)
496	return -tem;
497	return tem;
498	}
499
500	text1 = line1->text;
501	text2 = line2->text;
502
503	/* Compare using the second keyfield;
504	if that does not distinguish the lines, try the third keyfield;
505	and so on. */
506
507	for (i = 1; i < num_keyfields; i++)
508	{
509	long length1, length2;
510	char *start1 = find_field (&keyfields[i], text1, &length1);
511	char *start2 = find_field (&keyfields[i], text2, &length2);
512	int tem = compare_field (&keyfields[i], start1, length1,
513	text1 - text_base,
514	start2, length2, text2 - text_base);
515	if (tem)
516	{
517	if (keyfields[i].reverse)
518	return -tem;
519	return tem;
520	}
521	}
522
523	return 0; /* Lines match exactly. */
524	}
525
526	/* Like compare_full but more general.
527	You can pass any strings, and you can say how many keyfields to use.
528	POS1 and POS2 should indicate the nominal positional ordering of
529	the two lines in the input. */
530
531	int
532	compare_general (char str1, char str2, long int pos1, long int pos2, int use_keyfields)
533	{
534	int i;
535
536	/* Compare using the first keyfield;
537	if that does not distinguish the lines, try the second keyfield;
538	and so on. */
539
540	for (i = 0; i < use_keyfields; i++)
541	{
542	long length1, length2;
543	char *start1 = find_field (&keyfields[i], str1, &length1);
544	char *start2 = find_field (&keyfields[i], str2, &length2);
545	int tem = compare_field (&keyfields[i], start1, length1, pos1,
546	start2, length2, pos2);
547	if (tem)
548	{
549	if (keyfields[i].reverse)
550	return -tem;
551	return tem;
552	}
553	}
554
555	return 0; /* Lines match exactly. */
556	}
557
558	/* Find the start and length of a field in STR according to KEYFIELD.
559	A pointer to the starting character is returned, and the length
560	is stored into the int that LENGTHPTR points to. */
561
562	char *
563	find_field (struct keyfield keyfield, char str, long int *lengthptr)
564	{
565	char *start;
566	char *end;
567	char (fun) ();
568
569	if (keyfield->braced)
570	fun = find_braced_pos;
571	else
572	fun = find_pos;
573
574	start = (*fun) (str, keyfield->startwords, keyfield->startchars,
575	keyfield->ignore_blanks);
576	if (keyfield->endwords < 0)
577	{
578	if (keyfield->braced)
579	end = find_braced_end (start);
580	else
581	{
582	end = start;
583	while (end && end != '\n')
584	end++;
585	}
586	}
587	else
588	{
589	end = (*fun) (str, keyfield->endwords, keyfield->endchars, 0);
590	if (end - str < start - str)
591	end = start;
592	}
593	*lengthptr = end - start;
594	return start;
595	}
596
597	/* Return a pointer to a specified place within STR,
598	skipping (from the beginning) WORDS words and then CHARS chars.
599	If IGNORE_BLANKS is nonzero, we skip all blanks
600	after finding the specified word. */
601
602	char *
603	find_pos (char *str, int words, int chars, int ignore_blanks)
604	{
605	int i;
606	char *p = str;
607
608	for (i = 0; i < words; i++)
609	{
610	char c;
611	/* Find next bunch of nonblanks and skip them. */
612	while ((c = *p) == ' ' \|\| c == '\t')
613	p++;
614	while ((c = *p) && c != '\n' && !(c == ' ' \|\| c == '\t'))
615	p++;
616	if (!p \|\| p == '\n')
617	return p;
618	}
619
620	while (p == ' ' \|\| p == '\t')
621	p++;
622
623	for (i = 0; i < chars; i++)
624	{
625	if (!p \|\| p == '\n')
626	break;
627	p++;
628	}
629	return p;
630	}
631
632	/* Like find_pos but assumes that each field is surrounded by braces
633	and that braces within fields are balanced. */
634
635	char *
636	find_braced_pos (char *str, int words, int chars, int ignore_blanks)
637	{
638	int i;
639	int bracelevel;
640	char *p = str;
641	char c;
642
643	for (i = 0; i < words; i++)
644	{
645	bracelevel = 1;
646	while ((c = *p++) != '{' && c != '\n' && c)
647	/* Do nothing. */ ;
648	if (c != '{')
649	return p - 1;
650	while (bracelevel)
651	{
652	c = *p++;
653	if (c == '{')
654	bracelevel++;
655	if (c == '}')
656	bracelevel--;
657	if (c == 0 \|\| c == '\n')
658	return p - 1;
659	}
660	}
661
662	while ((c = *p++) != '{' && c != '\n' && c)
663	/* Do nothing. */ ;
664
665	if (c != '{')
666	return p - 1;
667
668	if (ignore_blanks)
669	while ((c = *p) == ' ' \|\| c == '\t')
670	p++;
671
672	for (i = 0; i < chars; i++)
673	{
674	if (!p \|\| p == '\n')
675	break;
676	p++;
677	}
678	return p;
679	}
680
681	/* Find the end of the balanced-brace field which starts at STR.
682	The position returned is just before the closing brace. */
683
684	char *
685	find_braced_end (char *str)
686	{
687	int bracelevel;
688	char *p = str;
689	char c;
690
691	bracelevel = 1;
692	while (bracelevel)
693	{
694	c = *p++;
695	if (c == '{')
696	bracelevel++;
697	if (c == '}')
698	bracelevel--;
699	if (c == 0 \|\| c == '\n')
700	return p - 1;
701	}
702	return p - 1;
703	}
704
705	long
706	find_value (char *start, long int length)
707	{
708	while (length != 0L)
709	{
710	if (isdigit (*start))
711	return atol (start);
712	length--;
713	start++;
714	}
715	return 0l;
716	}
717
718	/* Vector used to translate characters for comparison.
719	This is how we make all alphanumerics follow all else,
720	and ignore case in the first sorting. */
721	int char_order[256];
722
723	void
724	init_char_order (void)
725	{
726	int i;
727	for (i = 1; i < 256; i++)
728	char_order[i] = i;
729
730	for (i = '0'; i <= '9'; i++)
731	char_order[i] += 512;
732
733	for (i = 'a'; i <= 'z'; i++)
734	{
735	char_order[i] = 512 + i;
736	char_order[i + 'A' - 'a'] = 512 + i;
737	}
738	}
739
740	/* Compare two fields (each specified as a start pointer and a character count)
741	according to KEYFIELD.
742	The sign of the value reports the relation between the fields. */
743
744	int
745	compare_field (struct keyfield keyfield, char start1, long int length1,
746	long int pos1, char *start2, long int length2, long int pos2)
747	{
748	if (keyfields->positional)
749	{
750	if (pos1 > pos2)
751	return 1;
752	else
753	return -1;
754	}
755	if (keyfield->numeric)
756	{
757	long value = find_value (start1, length1) - find_value (start2, length2);
758	if (value > 0)
759	return 1;
760	if (value < 0)
761	return -1;
762	return 0;
763	}
764	else
765	{
766	char *p1 = start1;
767	char *p2 = start2;
768	char *e1 = start1 + length1;
769	char *e2 = start2 + length2;
770
771	while (1)
772	{
773	int c1, c2;
774
775	if (p1 == e1)
776	c1 = 0;
777	else
778	c1 = *p1++;
779	if (p2 == e2)
780	c2 = 0;
781	else
782	c2 = *p2++;
783
784	if (char_order[c1] != char_order[c2])
785	return char_order[c1] - char_order[c2];
786	if (!c1)
787	break;
788	}
789
790	/* Strings are equal except possibly for case. */
791	p1 = start1;
792	p2 = start2;
793	while (1)
794	{
795	int c1, c2;
796
797	if (p1 == e1)
798	c1 = 0;
799	else
800	c1 = *p1++;
801	if (p2 == e2)
802	c2 = 0;
803	else
804	c2 = *p2++;
805
806	if (c1 != c2)
807	/* Reverse sign here so upper case comes out last. */
808	return c2 - c1;
809	if (!c1)
810	break;
811	}
812
813	return 0;
814	}
815	}
816
817
818	/* A `struct linebuffer' is a structure which holds a line of text.
819	`readline' reads a line from a stream into a linebuffer
820	and works regardless of the length of the line. */
821
822	struct linebuffer
823	{
824	long size;
825	char *buffer;
826	};
827
828	/* Initialize LINEBUFFER for use. */
829
830	void
831	initbuffer (struct linebuffer *linebuffer)
832	{
833	linebuffer->size = 200;
834	linebuffer->buffer = (char *) xmalloc (200);
835	}
836
837	/* Read a line of text from STREAM into LINEBUFFER.
838	Return the length of the line. */
839
840	long
841	readline (struct linebuffer linebuffer, FILE stream)
842	{
843	char *buffer = linebuffer->buffer;
844	char *p = linebuffer->buffer;
845	char *end = p + linebuffer->size;
846
847	while (1)
848	{
849	int c = getc (stream);
850	if (p == end)
851	{
852	buffer = (char ) xrealloc (buffer, linebuffer->size = 2);
853	p += buffer - linebuffer->buffer;
854	end += buffer - linebuffer->buffer;
855	linebuffer->buffer = buffer;
856	}
857	if (c < 0 \|\| c == '\n')
858	{
859	*p = 0;
860	break;
861	}
862	*p++ = c;
863	}
864
865	return p - buffer;
866	}
867
868
869	/* Sort an input file too big to sort in core. */
870
871	void
872	sort_offline (char infile, off_t total, char outfile)
873	{
874	/* More than enough. */
875	int ntemps = 2 * (total + MAX_IN_CORE_SORT - 1) / MAX_IN_CORE_SORT;
876	char tempfiles = (char ) xmalloc (ntemps * sizeof (char *));
877	FILE *istream = fopen (infile, "r");
878	int i;
879	struct linebuffer lb;
880	long linelength;
881	int failure = 0;
882
883	initbuffer (&lb);
884
885	/* Read in one line of input data. */
886
887	linelength = readline (&lb, istream);
888
889	if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
890	{
891	error (_("%s: not a texinfo index file"), infile);
892	return;
893	}
894
895	/* Split up the input into `ntemps' temporary files, or maybe fewer,
896	and put the new files' names into `tempfiles' */
897
898	for (i = 0; i < ntemps; i++)
899	{
900	char *outname = maketempname (++tempcount);
901	FILE *ostream = fopen (outname, "w");
902	long tempsize = 0;
903
904	if (!ostream)
905	pfatal_with_name (outname);
906	tempfiles[i] = outname;
907
908	/* Copy lines into this temp file as long as it does not make file
909	"too big" or until there are no more lines. */
910
911	while (tempsize + linelength + 1 <= MAX_IN_CORE_SORT)
912	{
913	tempsize += linelength + 1;
914	fputs (lb.buffer, ostream);
915	putc ('\n', ostream);
916
917	/* Read another line of input data. */
918
919	linelength = readline (&lb, istream);
920	if (!linelength && feof (istream))
921	break;
922
923	if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
924	{
925	error (_("%s: not a texinfo index file"), infile);
926	failure = 1;
927	goto fail;
928	}
929	}
930	fclose (ostream);
931	if (feof (istream))
932	break;
933	}
934
935	free (lb.buffer);
936
937	fail:
938	/* Record number of temp files we actually needed. */
939
940	ntemps = i;
941
942	/* Sort each tempfile into another tempfile.
943	Delete the first set of tempfiles and put the names of the second
944	into `tempfiles'. */
945
946	for (i = 0; i < ntemps; i++)
947	{
948	char *newtemp = maketempname (++tempcount);
949	sort_in_core (tempfiles[i], MAX_IN_CORE_SORT, newtemp);
950	if (!keep_tempfiles)
951	unlink (tempfiles[i]);
952	tempfiles[i] = newtemp;
953	}
954
955	if (failure)
956	return;
957
958	/* Merge the tempfiles together and indexify. */
959
960	merge_files (tempfiles, ntemps, outfile);
961	}
962
963
964	/* Sort INFILE, whose size is TOTAL,
965	assuming that is small enough to be done in-core,
966	then indexify it and send the output to OUTFILE (or to stdout). */
967
968	void
969	sort_in_core (char infile, int total, char outfile)
970	{
971	char **nextline;
972	char data = (char ) xmalloc (total + 1);
973	char *file_data;
974	long file_size;
975	int i;
976	FILE *ostream = stdout;
977	struct lineinfo *lineinfo;
978
979	/* Read the contents of the file into the moby array `data'. */
980
981	int desc = open (infile, O_RDONLY, 0);
982
983	if (desc < 0)
984	fatal (_("failure reopening %s"), infile);
985	for (file_size = 0;;)
986	{
987	i = read (desc, data + file_size, total - file_size);
988	if (i <= 0)
989	break;
990	file_size += i;
991	}
992	file_data = data;
993	data[file_size] = 0;
994
995	close (desc);
996
997	if (file_size > 0 && data[0] != '\\' && data[0] != '@')
998	{
999	error (_("%s: not a texinfo index file"), infile);
1000	return;
1001	}
1002
1003	init_char_order ();
1004
1005	/* Sort routines want to know this address. */
1006
1007	text_base = data;
1008
1009	/* Create the array of pointers to lines, with a default size
1010	frequently enough. */
1011
1012	nlines = total / 50;
1013	if (!nlines)
1014	nlines = 2;
1015	linearray = (char *) xmalloc (nlines sizeof (char *));
1016
1017	/* `nextline' points to the next free slot in this array.
1018	`nlines' is the allocated size. */
1019
1020	nextline = linearray;
1021
1022	/* Parse the input file's data, and make entries for the lines. */
1023
1024	nextline = parsefile (infile, nextline, file_data, file_size);
1025	if (nextline == 0)
1026	{
1027	error (_("%s: not a texinfo index file"), infile);
1028	return;
1029	}
1030
1031	/* Sort the lines. */
1032
1033	/* If we have enough space, find the first keyfield of each line in advance.
1034	Make a `struct lineinfo' for each line, which records the keyfield
1035	as well as the line, and sort them. */
1036
1037	lineinfo = malloc ((nextline - linearray) * sizeof (struct lineinfo));
1038
1039	if (lineinfo)
1040	{
1041	struct lineinfo *lp;
1042	char **p;
1043
1044	for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1045	{
1046	lp->text = *p;
1047	lp->key.text = find_field (keyfields, *p, &lp->keylen);
1048	if (keyfields->numeric)
1049	lp->key.number = find_value (lp->key.text, lp->keylen);
1050	}
1051
1052	qsort (lineinfo, nextline - linearray, sizeof (struct lineinfo),
1053	compare_prepared);
1054
1055	for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1056	*p = lp->text;
1057
1058	free (lineinfo);
1059	}
1060	else
1061	qsort (linearray, nextline - linearray, sizeof (char *), compare_full);
1062
1063	/* Open the output file. */
1064
1065	if (outfile)
1066	{
1067	ostream = fopen (outfile, "w");
1068	if (!ostream)
1069	pfatal_with_name (outfile);
1070	}
1071
1072	writelines (linearray, nextline - linearray, ostream);
1073	if (outfile)
1074	fclose (ostream);
1075
1076	free (linearray);
1077	free (data);
1078	}
1079
1080
1081	/* Parse an input string in core into lines.
1082	DATA is the input string, and SIZE is its length.
1083	Data goes in LINEARRAY starting at NEXTLINE.
1084	The value returned is the first entry in LINEARRAY still unused.
1085	Value 0 means input file contents are invalid. */
1086
1087	char **
1088	parsefile (char filename, char nextline, char data, long int size)
1089	{
1090	char p, end;
1091	char **line = nextline;
1092
1093	p = data;
1094	end = p + size;
1095	*end = 0;
1096
1097	while (p != end)
1098	{
1099	if (p[0] != '\\' && p[0] != '@')
1100	return 0;
1101
1102	*line = p;
1103
1104	/* Find the first letter of the first field of this line. If it
1105	is different from the first letter of the first field of the
1106	first line, we need initial headers in the output index. */
1107	while (p && p != '{')
1108	p++;
1109	if (p == end)
1110	return 0;
1111	p++;
1112	if (first_initial)
1113	{
1114	if (first_initial != toupper (*p))
1115	need_initials = 1;
1116	}
1117	else
1118	first_initial = toupper (*p);
1119
1120	while (p && p != '\n')
1121	p++;
1122	if (p != end)
1123	p++;
1124
1125	line++;
1126	if (line == linearray + nlines)
1127	{
1128	char **old = linearray;
1129	linearray = xrealloc (linearray, sizeof (char ) (nlines *= 4));
1130	line += linearray - old;
1131	}
1132	}
1133
1134	return line;
1135	}
1136
1137
1138	/* Indexification is a filter applied to the sorted lines
1139	as they are being written to the output file.
1140	Multiple entries for the same name, with different page numbers,
1141	get combined into a single entry with multiple page numbers.
1142	The first braced field, which is used for sorting, is discarded.
1143	However, its first character is examined, folded to lower case,
1144	and if it is different from that in the previous line fed to us
1145	a \initial line is written with one argument, the new initial.
1146
1147	If an entry has four braced fields, then the second and third
1148	constitute primary and secondary names.
1149	In this case, each change of primary name
1150	generates a \primary line which contains only the primary name,
1151	and in between these are \secondary lines which contain
1152	just a secondary name and page numbers. */
1153
1154	/* The last primary name we wrote a \primary entry for.
1155	If only one level of indexing is being done, this is the last name seen. */
1156	char *lastprimary;
1157	/* Length of storage allocated for lastprimary. */
1158	int lastprimarylength;
1159
1160	/* Similar, for the secondary name. */
1161	char *lastsecondary;
1162	int lastsecondarylength;
1163
1164	/* Zero if we are not in the middle of writing an entry.
1165	One if we have written the beginning of an entry but have not
1166	yet written any page numbers into it.
1167	Greater than one if we have written the beginning of an entry
1168	plus at least one page number. */
1169	int pending;
1170
1171	/* The initial (for sorting purposes) of the last primary entry written.
1172	When this changes, a \initial {c} line is written */
1173
1174	char *lastinitial;
1175
1176	int lastinitiallength;
1177
1178	/* When we need a string of length 1 for the value of lastinitial,
1179	store it here. */
1180
1181	char lastinitial1[2];
1182
1183	/* Initialize static storage for writing an index. */
1184
1185	void
1186	init_index (void)
1187	{
1188	pending = 0;
1189	lastinitial = lastinitial1;
1190	lastinitial1[0] = 0;
1191	lastinitial1[1] = 0;
1192	lastinitiallength = 0;
1193	lastprimarylength = 100;
1194	lastprimary = (char *) xmalloc (lastprimarylength + 1);
1195	memset (lastprimary, '\0', lastprimarylength + 1);
1196	lastsecondarylength = 100;
1197	lastsecondary = (char *) xmalloc (lastsecondarylength + 1);
1198	memset (lastsecondary, '\0', lastsecondarylength + 1);
1199	}
1200
1201	/* Indexify. Merge entries for the same name,
1202	insert headers for each initial character, etc. */
1203
1204	void
1205	indexify (char line, FILE ostream)
1206	{
1207	char primary, secondary, *pagenumber;
1208	int primarylength, secondarylength = 0, pagelength;
1209	int nosecondary;
1210	int initiallength;
1211	char *initial;
1212	char initial1[2];
1213	register char *p;
1214
1215	/* First, analyze the parts of the entry fed to us this time. */
1216
1217	p = find_braced_pos (line, 0, 0, 0);
1218	if (*p == '{')
1219	{
1220	initial = p;
1221	/* Get length of inner pair of braces starting at `p',
1222	including that inner pair of braces. */
1223	initiallength = find_braced_end (p + 1) + 1 - p;
1224	}
1225	else
1226	{
1227	initial = initial1;
1228	initial1[0] = toupper (*p);
1229	initial1[1] = 0;
1230	initiallength = 1;
1231	}
1232
1233	pagenumber = find_braced_pos (line, 1, 0, 0);
1234	pagelength = find_braced_end (pagenumber) - pagenumber;
1235	if (pagelength == 0)
1236	fatal (_("No page number in %s"), line);
1237
1238	primary = find_braced_pos (line, 2, 0, 0);
1239	primarylength = find_braced_end (primary) - primary;
1240
1241	secondary = find_braced_pos (line, 3, 0, 0);
1242	nosecondary = !*secondary;
1243	if (!nosecondary)
1244	secondarylength = find_braced_end (secondary) - secondary;
1245
1246	/* If the primary is different from before, make a new primary entry. */
1247	if (strncmp (primary, lastprimary, primarylength))
1248	{
1249	/* Close off current secondary entry first, if one is open. */
1250	if (pending)
1251	{
1252	fputs ("}\n", ostream);
1253	pending = 0;
1254	}
1255
1256	/* If this primary has a different initial, include an entry for
1257	the initial. */
1258	if (need_initials &&
1259	(initiallength != lastinitiallength \|\|
1260	strncmp (initial, lastinitial, initiallength)))
1261	{
1262	fprintf (ostream, "\\initial {");
1263	fwrite (initial, 1, initiallength, ostream);
1264	fputs ("}\n", ostream);
1265	if (initial == initial1)
1266	{
1267	lastinitial = lastinitial1;
1268	lastinitial1 = initial1;
1269	}
1270	else
1271	{
1272	lastinitial = initial;
1273	}
1274	lastinitiallength = initiallength;
1275	}
1276
1277	/* Make the entry for the primary. */
1278	if (nosecondary)
1279	fputs ("\\entry {", ostream);
1280	else
1281	fputs ("\\primary {", ostream);
1282	fwrite (primary, primarylength, 1, ostream);
1283	if (nosecondary)
1284	{
1285	fputs ("}{", ostream);
1286	pending = 1;
1287	}
1288	else
1289	fputs ("}\n", ostream);
1290
1291	/* Record name of most recent primary. */
1292	if (lastprimarylength < primarylength)
1293	{
1294	lastprimarylength = primarylength + 100;
1295	lastprimary = (char *) xrealloc (lastprimary,
1296	1 + lastprimarylength);
1297	}
1298	strncpy (lastprimary, primary, primarylength);
1299	lastprimary[primarylength] = 0;
1300
1301	/* There is no current secondary within this primary, now. */
1302	lastsecondary[0] = 0;
1303	}
1304
1305	/* Should not have an entry with no subtopic following one with a
1306	subtopic. */
1307
1308	if (nosecondary && *lastsecondary)
1309	error (_("entry %s follows an entry with a secondary name"), line);
1310
1311	/* Start a new secondary entry if necessary. */
1312	if (!nosecondary && strncmp (secondary, lastsecondary, secondarylength))
1313	{
1314	if (pending)
1315	{
1316	fputs ("}\n", ostream);
1317	pending = 0;
1318	}
1319
1320	/* Write the entry for the secondary. */
1321	fputs ("\\secondary {", ostream);
1322	fwrite (secondary, secondarylength, 1, ostream);
1323	fputs ("}{", ostream);
1324	pending = 1;
1325
1326	/* Record name of most recent secondary. */
1327	if (lastsecondarylength < secondarylength)
1328	{
1329	lastsecondarylength = secondarylength + 100;
1330	lastsecondary = (char *) xrealloc (lastsecondary,
1331	1 + lastsecondarylength);
1332	}
1333	strncpy (lastsecondary, secondary, secondarylength);
1334	lastsecondary[secondarylength] = 0;
1335	}
1336
1337	/* Here to add one more page number to the current entry. */
1338	if (pending++ != 1)
1339	fputs (", ", ostream); /* Punctuate first, if this is not the first. */
1340	fwrite (pagenumber, pagelength, 1, ostream);
1341	}
1342
1343	/* Close out any unfinished output entry. */
1344
1345	void
1346	finish_index (FILE *ostream)
1347	{
1348	if (pending)
1349	fputs ("}\n", ostream);
1350	free (lastprimary);
1351	free (lastsecondary);
1352	}
1353
1354
1355	/* Copy the lines in the sorted order.
1356	Each line is copied out of the input file it was found in. */
1357
1358	void
1359	writelines (char *linearray, int nlines, FILE ostream)
1360	{
1361	char **stop_line = linearray + nlines;
1362	char **next_line;
1363
1364	init_index ();
1365
1366	/* Output the text of the lines, and free the buffer space. */
1367
1368	for (next_line = linearray; next_line != stop_line; next_line++)
1369	{
1370	/* If -u was specified, output the line only if distinct from
1371	previous one. */
1372	if (next_line == linearray
1373	/* Compare previous line with this one, using only the
1374	explicitly specd keyfields. */
1375	\|\| compare_general ((next_line - 1), next_line, 0L, 0L,
1376	num_keyfields - 1))
1377	{
1378	char p = next_line;
1379	char c;
1380
1381	while ((c = *p++) && c != '\n')
1382	/* Do nothing. */ ;
1383	*(p - 1) = 0;
1384	indexify (*next_line, ostream);
1385	}
1386	}
1387
1388	finish_index (ostream);
1389	}
1390
1391
1392	/* Assume (and optionally verify) that each input file is sorted;
1393	merge them and output the result.
1394	Returns nonzero if any input file fails to be sorted.
1395
1396	This is the high-level interface that can handle an unlimited
1397	number of files. */
1398
1399	#define MAX_DIRECT_MERGE 10
1400
1401	int
1402	merge_files (char *infiles, int nfiles, char outfile)
1403	{
1404	char **tempfiles;
1405	int ntemps;
1406	int i;
1407	int value = 0;
1408	int start_tempcount = tempcount;
1409
1410	if (nfiles <= MAX_DIRECT_MERGE)
1411	return merge_direct (infiles, nfiles, outfile);
1412
1413	/* Merge groups of MAX_DIRECT_MERGE input files at a time,
1414	making a temporary file to hold each group's result. */
1415
1416	ntemps = (nfiles + MAX_DIRECT_MERGE - 1) / MAX_DIRECT_MERGE;
1417	tempfiles = (char *) xmalloc (ntemps sizeof (char *));
1418	for (i = 0; i < ntemps; i++)
1419	{
1420	int nf = MAX_DIRECT_MERGE;
1421	if (i + 1 == ntemps)
1422	nf = nfiles - i * MAX_DIRECT_MERGE;
1423	tempfiles[i] = maketempname (++tempcount);
1424	value \|= merge_direct (&infiles[i * MAX_DIRECT_MERGE], nf, tempfiles[i]);
1425	}
1426
1427	/* All temporary files that existed before are no longer needed
1428	since their contents have been merged into our new tempfiles.
1429	So delete them. */
1430	flush_tempfiles (start_tempcount);
1431
1432	/* Now merge the temporary files we created. */
1433
1434	merge_files (tempfiles, ntemps, outfile);
1435
1436	free (tempfiles);
1437
1438	return value;
1439	}
1440
1441
1442	/* Assume (and optionally verify) that each input file is sorted;
1443	merge them and output the result.
1444	Returns nonzero if any input file fails to be sorted.
1445
1446	This version of merging will not work if the number of
1447	input files gets too high. Higher level functions
1448	use it only with a bounded number of input files. */
1449
1450	int
1451	merge_direct (char *infiles, int nfiles, char outfile)
1452	{
1453	struct linebuffer lb1, lb2;
1454	struct linebuffer thisline, prevline;
1455	FILE **streams;
1456	int i;
1457	int nleft;
1458	int lossage = 0;
1459	int *file_lossage;
1460	struct linebuffer *prev_out = 0;
1461	FILE *ostream = stdout;
1462
1463	if (outfile)
1464	{
1465	ostream = fopen (outfile, "w");
1466	}
1467	if (!ostream)
1468	pfatal_with_name (outfile);
1469
1470	init_index ();
1471
1472	if (nfiles == 0)
1473	{
1474	if (outfile)
1475	fclose (ostream);
1476	return 0;
1477	}
1478
1479	/* For each file, make two line buffers. Also, for each file, there
1480	is an element of `thisline' which points at any time to one of the
1481	file's two buffers, and an element of `prevline' which points to
1482	the other buffer. `thisline' is supposed to point to the next
1483	available line from the file, while `prevline' holds the last file
1484	line used, which is remembered so that we can verify that the file
1485	is properly sorted. */
1486
1487	/* lb1 and lb2 contain one buffer each per file. */
1488	lb1 = (struct linebuffer ) xmalloc (nfiles sizeof (struct linebuffer));
1489	lb2 = (struct linebuffer ) xmalloc (nfiles sizeof (struct linebuffer));
1490
1491	/* thisline[i] points to the linebuffer holding the next available
1492	line in file i, or is zero if there are no lines left in that file. */
1493	thisline = (struct linebuffer **)
1494	xmalloc (nfiles * sizeof (struct linebuffer *));
1495	/* prevline[i] points to the linebuffer holding the last used line
1496	from file i. This is just for verifying that file i is properly
1497	sorted. */
1498	prevline = (struct linebuffer **)
1499	xmalloc (nfiles * sizeof (struct linebuffer *));
1500	/* streams[i] holds the input stream for file i. */
1501	streams = (FILE *) xmalloc (nfiles sizeof (FILE *));
1502	/* file_lossage[i] is nonzero if we already know file i is not
1503	properly sorted. */
1504	file_lossage = (int ) xmalloc (nfiles sizeof (int));
1505
1506	/* Allocate and initialize all that storage. */
1507
1508	for (i = 0; i < nfiles; i++)
1509	{
1510	initbuffer (&lb1[i]);
1511	initbuffer (&lb2[i]);
1512	thisline[i] = &lb1[i];
1513	prevline[i] = &lb2[i];
1514	file_lossage[i] = 0;
1515	streams[i] = fopen (infiles[i], "r");
1516	if (!streams[i])
1517	pfatal_with_name (infiles[i]);
1518
1519	readline (thisline[i], streams[i]);
1520	}
1521
1522	/* Keep count of number of files not at eof. */
1523	nleft = nfiles;
1524
1525	while (nleft)
1526	{
1527	struct linebuffer *best = 0;
1528	struct linebuffer *exch;
1529	int bestfile = -1;
1530	int i;
1531
1532	/* Look at the next avail line of each file; choose the least one. */
1533
1534	for (i = 0; i < nfiles; i++)
1535	{
1536	if (thisline[i] &&
1537	(!best \|\|
1538	0 < compare_general (best->buffer, thisline[i]->buffer,
1539	(long) bestfile, (long) i, num_keyfields)))
1540	{
1541	best = thisline[i];
1542	bestfile = i;
1543	}
1544	}
1545
1546	/* Output that line, unless it matches the previous one and we
1547	don't want duplicates. */
1548
1549	if (!(prev_out &&
1550	!compare_general (prev_out->buffer,
1551	best->buffer, 0L, 1L, num_keyfields - 1)))
1552	indexify (best->buffer, ostream);
1553	prev_out = best;
1554
1555	/* Now make the line the previous of its file, and fetch a new
1556	line from that file. */
1557
1558	exch = prevline[bestfile];
1559	prevline[bestfile] = thisline[bestfile];
1560	thisline[bestfile] = exch;
1561
1562	while (1)
1563	{
1564	/* If the file has no more, mark it empty. */
1565
1566	if (feof (streams[bestfile]))
1567	{
1568	thisline[bestfile] = 0;
1569	/* Update the number of files still not empty. */
1570	nleft--;
1571	break;
1572	}
1573	readline (thisline[bestfile], streams[bestfile]);
1574	if (thisline[bestfile]->buffer[0] \|\| !feof (streams[bestfile]))
1575	break;
1576	}
1577	}
1578
1579	finish_index (ostream);
1580
1581	/* Free all storage and close all input streams. */
1582
1583	for (i = 0; i < nfiles; i++)
1584	{
1585	fclose (streams[i]);
1586	free (lb1[i].buffer);
1587	free (lb2[i].buffer);
1588	}
1589	free (file_lossage);
1590	free (lb1);
1591	free (lb2);
1592	free (thisline);
1593	free (prevline);
1594	free (streams);
1595
1596	if (outfile)
1597	fclose (ostream);
1598
1599	return lossage;
1600	}
1601
1602
1603	/* Print error message and exit. */
1604
1605	void
1606	fatal (const char format, const char arg)
1607	{
1608	error (format, arg);
1609	xexit (1);
1610	}
1611
1612	/* Print error message. FORMAT is printf control string, ARG is arg for it. */
1613	void
1614	error (const char format, const char arg)
1615	{
1616	printf ("%s: ", program_name);
1617	printf (format, arg);
1618	if (format[strlen (format) -1] != '\n')
1619	printf ("\n");
1620	}
1621
1622	void
1623	perror_with_name (const char *name)
1624	{
1625	fprintf (stderr, "%s: ", program_name);
1626	perror (name);
1627	}
1628
1629	void
1630	pfatal_with_name (const char *name)
1631	{
1632	perror_with_name (name);
1633	xexit (1);
1634	}
1635
1636
1637
1638	/* Return a newly-allocated string concatenating S1 and S2. */
1639
1640	char *
1641	concat (char s1, char s2)
1642	{
1643	int len1 = strlen (s1), len2 = strlen (s2);
1644	char result = (char ) xmalloc (len1 + len2 + 1);
1645
1646	strcpy (result, s1);
1647	strcpy (result + len1, s2);
1648	*(result + len1 + len2) = 0;
1649
1650	return result;
1651	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/sys-apps/texinfo/util/texindex.c@ 3043

Download in other formats: