source: trunk/essentials/sys-apps/texinfo/util/texindex.c@ 3043

Last change on this file since 3043 was 2619, checked in by bird, 19 years ago

applied OS/2 patches (manually).

File size: 42.4 KB
Line 
1/* texindex -- sort TeX index dribble output into an actual index.
2 $Id: texindex.c,v 1.11 2004/04/11 17:56:47 karl Exp $
3
4 Copyright (C) 1987, 1991, 1992, 1996, 1997, 1998, 1999, 2000, 2001,
5 2002, 2003, 2004 Free Software Foundation, Inc.
6
7 This program is free software; you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 2, or (at your option)
10 any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307. */
20
21#include "system.h"
22#include <getopt.h>
23
24#ifdef __EMX__
25# include <stdlib.h>
26#endif
27
28static char *program_name = "texindex";
29
30#if defined (emacs)
31# include "../src/config.h"
32/* Some s/os.h files redefine these. */
33# undef read
34# undef close
35# undef write
36# undef open
37#endif
38
39#if !defined (HAVE_MEMSET)
40#undef memset
41#define memset(ptr, ignore, count) bzero (ptr, count)
42#endif
43
44char *mktemp (char *);
45
46#if !defined (SEEK_SET)
47# define SEEK_SET 0
48# define SEEK_CUR 1
49# define SEEK_END 2
50#endif /* !SEEK_SET */
51
52struct linebuffer;
53
54/* When sorting in core, this structure describes one line
55 and the position and length of its first keyfield. */
56struct lineinfo
57{
58 char *text; /* The actual text of the line. */
59 union {
60 char *text; /* The start of the key (for textual comparison). */
61 long number; /* The numeric value (for numeric comparison). */
62 } key;
63 long keylen; /* Length of KEY field. */
64};
65
66/* This structure describes a field to use as a sort key. */
67struct keyfield
68{
69 int startwords; /* Number of words to skip. */
70 int startchars; /* Number of additional chars to skip. */
71 int endwords; /* Number of words to ignore at end. */
72 int endchars; /* Ditto for characters of last word. */
73 char ignore_blanks; /* Non-zero means ignore spaces and tabs. */
74 char fold_case; /* Non-zero means case doesn't matter. */
75 char reverse; /* Non-zero means compare in reverse order. */
76 char numeric; /* Non-zeros means field is ASCII numeric. */
77 char positional; /* Sort according to file position. */
78 char braced; /* Count balanced-braced groupings as fields. */
79};
80
81/* Vector of keyfields to use. */
82struct keyfield keyfields[3];
83
84/* Number of keyfields stored in that vector. */
85int num_keyfields = 3;
86
87/* Vector of input file names, terminated with a null pointer. */
88char **infiles;
89
90/* Vector of corresponding output file names, or NULL, meaning default it
91 (add an `s' to the end). */
92char **outfiles;
93
94/* Length of `infiles'. */
95int num_infiles;
96
97/* Pointer to the array of pointers to lines being sorted. */
98char **linearray;
99
100/* The allocated length of `linearray'. */
101long nlines;
102
103/* Directory to use for temporary files. On Unix, it ends with a slash. */
104char *tempdir;
105
106/* Number of last temporary file. */
107int tempcount;
108
109/* Number of last temporary file already deleted.
110 Temporary files are deleted by `flush_tempfiles' in order of creation. */
111int last_deleted_tempcount;
112
113/* During in-core sort, this points to the base of the data block
114 which contains all the lines of data. */
115char *text_base;
116
117/* Initially 0; changed to 1 if we want initials in this index. */
118int need_initials;
119
120/* Remembers the first initial letter seen in this index, so we can
121 determine whether we need initials in the sorted form. */
122char first_initial;
123
124/* Additional command switches .*/
125
126/* Nonzero means do not delete tempfiles -- for debugging. */
127int keep_tempfiles;
128
129/* Forward declarations of functions in this file. */
130void decode_command (int argc, char **argv);
131void sort_in_core (char *infile, int total, char *outfile);
132void sort_offline (char *infile, off_t total, char *outfile);
133char **parsefile (char *filename, char **nextline, char *data, long int size);
134char *find_field (struct keyfield *keyfield, char *str, long int *lengthptr);
135char *find_pos (char *str, int words, int chars, int ignore_blanks);
136long find_value (char *start, long int length);
137char *find_braced_pos (char *str, int words, int chars, int ignore_blanks);
138char *find_braced_end (char *str);
139void writelines (char **linearray, int nlines, FILE *ostream);
140int compare_field (struct keyfield *keyfield, char *start1,
141 long int length1, long int pos1, char *start2,
142 long int length2, long int pos2);
143int compare_full (const void *, const void *);
144long readline (struct linebuffer *linebuffer, FILE *stream);
145int merge_files (char **infiles, int nfiles, char *outfile);
146int merge_direct (char **infiles, int nfiles, char *outfile);
147void pfatal_with_name (const char *name);
148void fatal (const char *format, const char *arg);
149void error (const char *format, const char *arg);
150void *xmalloc (), *xrealloc ();
151char *concat (char *s1, char *s2);
152void flush_tempfiles (int to_count);
153
154
155#define MAX_IN_CORE_SORT 500000
156
157int
158main (int argc, char **argv)
159{
160 int i;
161
162 tempcount = 0;
163 last_deleted_tempcount = 0;
164
165#ifdef __EMX__
166 _response(&argc, &argv);
167 _wildcard(&argc, &argv);
168#endif
169
170#ifdef HAVE_SETLOCALE
171 /* Set locale via LC_ALL. */
172 setlocale (LC_ALL, "");
173#endif
174
175 /* Set the text message domain. */
176 bindtextdomain (PACKAGE, LOCALEDIR);
177 textdomain (PACKAGE);
178
179 /* In case we write to a redirected stdout that fails. */
180 /* not ready atexit (close_stdout); */
181
182 /* Describe the kind of sorting to do. */
183 /* The first keyfield uses the first braced field and folds case. */
184 keyfields[0].braced = 1;
185 keyfields[0].fold_case = 1;
186 keyfields[0].endwords = -1;
187 keyfields[0].endchars = -1;
188
189 /* The second keyfield uses the second braced field, numerically. */
190 keyfields[1].braced = 1;
191 keyfields[1].numeric = 1;
192 keyfields[1].startwords = 1;
193 keyfields[1].endwords = -1;
194 keyfields[1].endchars = -1;
195
196 /* The third keyfield (which is ignored while discarding duplicates)
197 compares the whole line. */
198 keyfields[2].endwords = -1;
199 keyfields[2].endchars = -1;
200
201 decode_command (argc, argv);
202
203 /* Process input files completely, one by one. */
204
205 for (i = 0; i < num_infiles; i++)
206 {
207 int desc;
208 off_t ptr;
209 char *outfile;
210 struct stat instat;
211
212 desc = open (infiles[i], O_RDONLY, 0);
213 if (desc < 0)
214 pfatal_with_name (infiles[i]);
215
216 if (stat (infiles[i], &instat))
217 pfatal_with_name (infiles[i]);
218 if (S_ISDIR (instat.st_mode))
219 {
220#ifdef EISDIR
221 errno = EISDIR;
222#endif
223 pfatal_with_name (infiles[i]);
224 }
225
226 lseek (desc, (off_t) 0, SEEK_END);
227 ptr = (off_t) lseek (desc, (off_t) 0, SEEK_CUR);
228
229 close (desc);
230
231 outfile = outfiles[i];
232 if (!outfile)
233 outfile = concat (infiles[i], "s");
234
235 need_initials = 0;
236 first_initial = '\0';
237
238 if (ptr < MAX_IN_CORE_SORT)
239 /* Sort a small amount of data. */
240 sort_in_core (infiles[i], (int)ptr, outfile);
241 else
242 sort_offline (infiles[i], ptr, outfile);
243 }
244
245 flush_tempfiles (tempcount);
246 xexit (0);
247 return 0; /* Avoid bogus warnings. */
248}
249
250
251typedef struct
252{
253 char *long_name;
254 char *short_name;
255 int *variable_ref;
256 int variable_value;
257 char *arg_name;
258 char *doc_string;
259} TEXINDEX_OPTION;
260
261TEXINDEX_OPTION texindex_options[] = {
262 { "--help", "-h", (int *)NULL, 0, (char *)NULL,
263 N_("display this help and exit") },
264 { "--keep", "-k", &keep_tempfiles, 1, (char *)NULL,
265 N_("keep temporary files around after processing") },
266 { "--no-keep", 0, &keep_tempfiles, 0, (char *)NULL,
267 N_("do not keep temporary files around after processing (default)") },
268 { "--output", "-o", (int *)NULL, 0, "FILE",
269 N_("send output to FILE") },
270 { "--version", (char *)NULL, (int *)NULL, 0, (char *)NULL,
271 N_("display version information and exit") },
272 { (char *)NULL, (char *)NULL, (int *)NULL, 0, (char *)NULL }
273};
274
275void
276usage (int result_value)
277{
278 register int i;
279 FILE *f = result_value ? stderr : stdout;
280
281 fprintf (f, _("Usage: %s [OPTION]... FILE...\n"), program_name);
282 fprintf (f, _("Generate a sorted index for each TeX output FILE.\n"));
283 /* Avoid trigraph nonsense. */
284 fprintf (f,
285_("Usually FILE... is specified as `foo.%c%c\' for a document `foo.texi'.\n"),
286 '?', '?'); /* avoid trigraph in cat-id-tbl.c */
287 fprintf (f, _("\nOptions:\n"));
288
289 for (i = 0; texindex_options[i].long_name; i++)
290 {
291 putc (' ', f);
292
293 if (texindex_options[i].short_name)
294 fprintf (f, "%s, ", texindex_options[i].short_name);
295
296 fprintf (f, "%s %s",
297 texindex_options[i].long_name,
298 texindex_options[i].arg_name
299 ? texindex_options[i].arg_name : "");
300
301 fprintf (f, "\t%s\n", _(texindex_options[i].doc_string));
302 }
303 fputs (_("\n\
304Email bug reports to bug-texinfo@gnu.org,\n\
305general questions and discussion to help-texinfo@gnu.org.\n\
306Texinfo home page: http://www.gnu.org/software/texinfo/"), f);
307 fputs ("\n", f);
308
309 xexit (result_value);
310}
311
312/* Decode the command line arguments to set the parameter variables
313 and set up the vector of keyfields and the vector of input files. */
314
315void
316decode_command (int argc, char **argv)
317{
318 int arg_index = 1;
319 char **ip;
320 char **op;
321
322 /* Store default values into parameter variables. */
323
324 tempdir = getenv ("TMPDIR");
325 if (tempdir == NULL)
326 tempdir = getenv ("TEMP");
327 if (tempdir == NULL)
328 tempdir = getenv ("TMP");
329 if (tempdir == NULL)
330 tempdir = DEFAULT_TMPDIR;
331 else
332 tempdir = concat (tempdir, "/");
333
334 keep_tempfiles = 0;
335
336 /* Allocate ARGC input files, which must be enough. */
337
338 infiles = (char **) xmalloc (argc * sizeof (char *));
339 outfiles = (char **) xmalloc (argc * sizeof (char *));
340 ip = infiles;
341 op = outfiles;
342
343 while (arg_index < argc)
344 {
345 char *arg = argv[arg_index++];
346
347 if (*arg == '-')
348 {
349 if (strcmp (arg, "--version") == 0)
350 {
351 printf ("texindex (GNU %s) %s\n", PACKAGE, VERSION);
352 puts ("");
353 puts ("Copyright (C) 2004 Free Software Foundation, Inc.");
354 printf (_("There is NO warranty. You may redistribute this software\n\
355under the terms of the GNU General Public License.\n\
356For more information about these matters, see the files named COPYING.\n"));
357 xexit (0);
358 }
359 else if ((strcmp (arg, "--keep") == 0) ||
360 (strcmp (arg, "-k") == 0))
361 {
362 keep_tempfiles = 1;
363 }
364 else if ((strcmp (arg, "--help") == 0) ||
365 (strcmp (arg, "-h") == 0))
366 {
367 usage (0);
368 }
369 else if ((strcmp (arg, "--output") == 0) ||
370 (strcmp (arg, "-o") == 0))
371 {
372 if (argv[arg_index] != (char *)NULL)
373 {
374 arg_index++;
375 if (op > outfiles)
376 *(op - 1) = argv[arg_index];
377 }
378 else
379 usage (1);
380 }
381 else
382 usage (1);
383 }
384 else
385 {
386 *ip++ = arg;
387 *op++ = (char *)NULL;
388 }
389 }
390
391 /* Record number of keyfields and terminate list of filenames. */
392 num_infiles = ip - infiles;
393 *ip = (char *)NULL;
394 if (num_infiles == 0)
395 usage (1);
396}
397
398
399/* Return a name for temporary file COUNT. */
400
401static char *
402maketempname (int count)
403{
404 static char *tempbase = NULL;
405 char tempsuffix[10];
406
407 if (!tempbase)
408 {
409 int fd;
410 tempbase = concat (tempdir, "txidxXXXXXX");
411
412 fd = mkstemp (tempbase);
413 if (fd == -1)
414 pfatal_with_name (tempbase);
415 }
416
417 sprintf (tempsuffix, ".%d", count);
418 return concat (tempbase, tempsuffix);
419}
420
421
422/* Delete all temporary files up to TO_COUNT. */
423
424void
425flush_tempfiles (int to_count)
426{
427 if (keep_tempfiles)
428 return;
429 while (last_deleted_tempcount < to_count)
430 unlink (maketempname (++last_deleted_tempcount));
431}
432
433
434
435/* Compare LINE1 and LINE2 according to the specified set of keyfields. */
436
437int
438compare_full (const void *p1, const void *p2)
439{
440 char **line1 = (char **) p1;
441 char **line2 = (char **) p2;
442 int i;
443
444 /* Compare using the first keyfield;
445 if that does not distinguish the lines, try the second keyfield;
446 and so on. */
447
448 for (i = 0; i < num_keyfields; i++)
449 {
450 long length1, length2;
451 char *start1 = find_field (&keyfields[i], *line1, &length1);
452 char *start2 = find_field (&keyfields[i], *line2, &length2);
453 int tem = compare_field (&keyfields[i], start1, length1,
454 *line1 - text_base,
455 start2, length2, *line2 - text_base);
456 if (tem)
457 {
458 if (keyfields[i].reverse)
459 return -tem;
460 return tem;
461 }
462 }
463
464 return 0; /* Lines match exactly. */
465}
466
467/* Compare LINE1 and LINE2, described by structures
468 in which the first keyfield is identified in advance.
469 For positional sorting, assumes that the order of the lines in core
470 reflects their nominal order. */
471int
472compare_prepared (const void *p1, const void *p2)
473{
474 struct lineinfo *line1 = (struct lineinfo *) p1;
475 struct lineinfo *line2 = (struct lineinfo *) p2;
476 int i;
477 int tem;
478 char *text1, *text2;
479
480 /* Compare using the first keyfield, which has been found for us already. */
481 if (keyfields->positional)
482 {
483 if (line1->text - text_base > line2->text - text_base)
484 tem = 1;
485 else
486 tem = -1;
487 }
488 else if (keyfields->numeric)
489 tem = line1->key.number - line2->key.number;
490 else
491 tem = compare_field (keyfields, line1->key.text, line1->keylen, 0,
492 line2->key.text, line2->keylen, 0);
493 if (tem)
494 {
495 if (keyfields->reverse)
496 return -tem;
497 return tem;
498 }
499
500 text1 = line1->text;
501 text2 = line2->text;
502
503 /* Compare using the second keyfield;
504 if that does not distinguish the lines, try the third keyfield;
505 and so on. */
506
507 for (i = 1; i < num_keyfields; i++)
508 {
509 long length1, length2;
510 char *start1 = find_field (&keyfields[i], text1, &length1);
511 char *start2 = find_field (&keyfields[i], text2, &length2);
512 int tem = compare_field (&keyfields[i], start1, length1,
513 text1 - text_base,
514 start2, length2, text2 - text_base);
515 if (tem)
516 {
517 if (keyfields[i].reverse)
518 return -tem;
519 return tem;
520 }
521 }
522
523 return 0; /* Lines match exactly. */
524}
525
526/* Like compare_full but more general.
527 You can pass any strings, and you can say how many keyfields to use.
528 POS1 and POS2 should indicate the nominal positional ordering of
529 the two lines in the input. */
530
531int
532compare_general (char *str1, char *str2, long int pos1, long int pos2, int use_keyfields)
533{
534 int i;
535
536 /* Compare using the first keyfield;
537 if that does not distinguish the lines, try the second keyfield;
538 and so on. */
539
540 for (i = 0; i < use_keyfields; i++)
541 {
542 long length1, length2;
543 char *start1 = find_field (&keyfields[i], str1, &length1);
544 char *start2 = find_field (&keyfields[i], str2, &length2);
545 int tem = compare_field (&keyfields[i], start1, length1, pos1,
546 start2, length2, pos2);
547 if (tem)
548 {
549 if (keyfields[i].reverse)
550 return -tem;
551 return tem;
552 }
553 }
554
555 return 0; /* Lines match exactly. */
556}
557
558/* Find the start and length of a field in STR according to KEYFIELD.
559 A pointer to the starting character is returned, and the length
560 is stored into the int that LENGTHPTR points to. */
561
562char *
563find_field (struct keyfield *keyfield, char *str, long int *lengthptr)
564{
565 char *start;
566 char *end;
567 char *(*fun) ();
568
569 if (keyfield->braced)
570 fun = find_braced_pos;
571 else
572 fun = find_pos;
573
574 start = (*fun) (str, keyfield->startwords, keyfield->startchars,
575 keyfield->ignore_blanks);
576 if (keyfield->endwords < 0)
577 {
578 if (keyfield->braced)
579 end = find_braced_end (start);
580 else
581 {
582 end = start;
583 while (*end && *end != '\n')
584 end++;
585 }
586 }
587 else
588 {
589 end = (*fun) (str, keyfield->endwords, keyfield->endchars, 0);
590 if (end - str < start - str)
591 end = start;
592 }
593 *lengthptr = end - start;
594 return start;
595}
596
597/* Return a pointer to a specified place within STR,
598 skipping (from the beginning) WORDS words and then CHARS chars.
599 If IGNORE_BLANKS is nonzero, we skip all blanks
600 after finding the specified word. */
601
602char *
603find_pos (char *str, int words, int chars, int ignore_blanks)
604{
605 int i;
606 char *p = str;
607
608 for (i = 0; i < words; i++)
609 {
610 char c;
611 /* Find next bunch of nonblanks and skip them. */
612 while ((c = *p) == ' ' || c == '\t')
613 p++;
614 while ((c = *p) && c != '\n' && !(c == ' ' || c == '\t'))
615 p++;
616 if (!*p || *p == '\n')
617 return p;
618 }
619
620 while (*p == ' ' || *p == '\t')
621 p++;
622
623 for (i = 0; i < chars; i++)
624 {
625 if (!*p || *p == '\n')
626 break;
627 p++;
628 }
629 return p;
630}
631
632/* Like find_pos but assumes that each field is surrounded by braces
633 and that braces within fields are balanced. */
634
635char *
636find_braced_pos (char *str, int words, int chars, int ignore_blanks)
637{
638 int i;
639 int bracelevel;
640 char *p = str;
641 char c;
642
643 for (i = 0; i < words; i++)
644 {
645 bracelevel = 1;
646 while ((c = *p++) != '{' && c != '\n' && c)
647 /* Do nothing. */ ;
648 if (c != '{')
649 return p - 1;
650 while (bracelevel)
651 {
652 c = *p++;
653 if (c == '{')
654 bracelevel++;
655 if (c == '}')
656 bracelevel--;
657 if (c == 0 || c == '\n')
658 return p - 1;
659 }
660 }
661
662 while ((c = *p++) != '{' && c != '\n' && c)
663 /* Do nothing. */ ;
664
665 if (c != '{')
666 return p - 1;
667
668 if (ignore_blanks)
669 while ((c = *p) == ' ' || c == '\t')
670 p++;
671
672 for (i = 0; i < chars; i++)
673 {
674 if (!*p || *p == '\n')
675 break;
676 p++;
677 }
678 return p;
679}
680
681/* Find the end of the balanced-brace field which starts at STR.
682 The position returned is just before the closing brace. */
683
684char *
685find_braced_end (char *str)
686{
687 int bracelevel;
688 char *p = str;
689 char c;
690
691 bracelevel = 1;
692 while (bracelevel)
693 {
694 c = *p++;
695 if (c == '{')
696 bracelevel++;
697 if (c == '}')
698 bracelevel--;
699 if (c == 0 || c == '\n')
700 return p - 1;
701 }
702 return p - 1;
703}
704
705long
706find_value (char *start, long int length)
707{
708 while (length != 0L)
709 {
710 if (isdigit (*start))
711 return atol (start);
712 length--;
713 start++;
714 }
715 return 0l;
716}
717
718/* Vector used to translate characters for comparison.
719 This is how we make all alphanumerics follow all else,
720 and ignore case in the first sorting. */
721int char_order[256];
722
723void
724init_char_order (void)
725{
726 int i;
727 for (i = 1; i < 256; i++)
728 char_order[i] = i;
729
730 for (i = '0'; i <= '9'; i++)
731 char_order[i] += 512;
732
733 for (i = 'a'; i <= 'z'; i++)
734 {
735 char_order[i] = 512 + i;
736 char_order[i + 'A' - 'a'] = 512 + i;
737 }
738}
739
740/* Compare two fields (each specified as a start pointer and a character count)
741 according to KEYFIELD.
742 The sign of the value reports the relation between the fields. */
743
744int
745compare_field (struct keyfield *keyfield, char *start1, long int length1,
746 long int pos1, char *start2, long int length2, long int pos2)
747{
748 if (keyfields->positional)
749 {
750 if (pos1 > pos2)
751 return 1;
752 else
753 return -1;
754 }
755 if (keyfield->numeric)
756 {
757 long value = find_value (start1, length1) - find_value (start2, length2);
758 if (value > 0)
759 return 1;
760 if (value < 0)
761 return -1;
762 return 0;
763 }
764 else
765 {
766 char *p1 = start1;
767 char *p2 = start2;
768 char *e1 = start1 + length1;
769 char *e2 = start2 + length2;
770
771 while (1)
772 {
773 int c1, c2;
774
775 if (p1 == e1)
776 c1 = 0;
777 else
778 c1 = *p1++;
779 if (p2 == e2)
780 c2 = 0;
781 else
782 c2 = *p2++;
783
784 if (char_order[c1] != char_order[c2])
785 return char_order[c1] - char_order[c2];
786 if (!c1)
787 break;
788 }
789
790 /* Strings are equal except possibly for case. */
791 p1 = start1;
792 p2 = start2;
793 while (1)
794 {
795 int c1, c2;
796
797 if (p1 == e1)
798 c1 = 0;
799 else
800 c1 = *p1++;
801 if (p2 == e2)
802 c2 = 0;
803 else
804 c2 = *p2++;
805
806 if (c1 != c2)
807 /* Reverse sign here so upper case comes out last. */
808 return c2 - c1;
809 if (!c1)
810 break;
811 }
812
813 return 0;
814 }
815}
816
817
818/* A `struct linebuffer' is a structure which holds a line of text.
819 `readline' reads a line from a stream into a linebuffer
820 and works regardless of the length of the line. */
821
822struct linebuffer
823{
824 long size;
825 char *buffer;
826};
827
828/* Initialize LINEBUFFER for use. */
829
830void
831initbuffer (struct linebuffer *linebuffer)
832{
833 linebuffer->size = 200;
834 linebuffer->buffer = (char *) xmalloc (200);
835}
836
837/* Read a line of text from STREAM into LINEBUFFER.
838 Return the length of the line. */
839
840long
841readline (struct linebuffer *linebuffer, FILE *stream)
842{
843 char *buffer = linebuffer->buffer;
844 char *p = linebuffer->buffer;
845 char *end = p + linebuffer->size;
846
847 while (1)
848 {
849 int c = getc (stream);
850 if (p == end)
851 {
852 buffer = (char *) xrealloc (buffer, linebuffer->size *= 2);
853 p += buffer - linebuffer->buffer;
854 end += buffer - linebuffer->buffer;
855 linebuffer->buffer = buffer;
856 }
857 if (c < 0 || c == '\n')
858 {
859 *p = 0;
860 break;
861 }
862 *p++ = c;
863 }
864
865 return p - buffer;
866}
867
868
869/* Sort an input file too big to sort in core. */
870
871void
872sort_offline (char *infile, off_t total, char *outfile)
873{
874 /* More than enough. */
875 int ntemps = 2 * (total + MAX_IN_CORE_SORT - 1) / MAX_IN_CORE_SORT;
876 char **tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
877 FILE *istream = fopen (infile, "r");
878 int i;
879 struct linebuffer lb;
880 long linelength;
881 int failure = 0;
882
883 initbuffer (&lb);
884
885 /* Read in one line of input data. */
886
887 linelength = readline (&lb, istream);
888
889 if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
890 {
891 error (_("%s: not a texinfo index file"), infile);
892 return;
893 }
894
895 /* Split up the input into `ntemps' temporary files, or maybe fewer,
896 and put the new files' names into `tempfiles' */
897
898 for (i = 0; i < ntemps; i++)
899 {
900 char *outname = maketempname (++tempcount);
901 FILE *ostream = fopen (outname, "w");
902 long tempsize = 0;
903
904 if (!ostream)
905 pfatal_with_name (outname);
906 tempfiles[i] = outname;
907
908 /* Copy lines into this temp file as long as it does not make file
909 "too big" or until there are no more lines. */
910
911 while (tempsize + linelength + 1 <= MAX_IN_CORE_SORT)
912 {
913 tempsize += linelength + 1;
914 fputs (lb.buffer, ostream);
915 putc ('\n', ostream);
916
917 /* Read another line of input data. */
918
919 linelength = readline (&lb, istream);
920 if (!linelength && feof (istream))
921 break;
922
923 if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
924 {
925 error (_("%s: not a texinfo index file"), infile);
926 failure = 1;
927 goto fail;
928 }
929 }
930 fclose (ostream);
931 if (feof (istream))
932 break;
933 }
934
935 free (lb.buffer);
936
937fail:
938 /* Record number of temp files we actually needed. */
939
940 ntemps = i;
941
942 /* Sort each tempfile into another tempfile.
943 Delete the first set of tempfiles and put the names of the second
944 into `tempfiles'. */
945
946 for (i = 0; i < ntemps; i++)
947 {
948 char *newtemp = maketempname (++tempcount);
949 sort_in_core (tempfiles[i], MAX_IN_CORE_SORT, newtemp);
950 if (!keep_tempfiles)
951 unlink (tempfiles[i]);
952 tempfiles[i] = newtemp;
953 }
954
955 if (failure)
956 return;
957
958 /* Merge the tempfiles together and indexify. */
959
960 merge_files (tempfiles, ntemps, outfile);
961}
962
963
964/* Sort INFILE, whose size is TOTAL,
965 assuming that is small enough to be done in-core,
966 then indexify it and send the output to OUTFILE (or to stdout). */
967
968void
969sort_in_core (char *infile, int total, char *outfile)
970{
971 char **nextline;
972 char *data = (char *) xmalloc (total + 1);
973 char *file_data;
974 long file_size;
975 int i;
976 FILE *ostream = stdout;
977 struct lineinfo *lineinfo;
978
979 /* Read the contents of the file into the moby array `data'. */
980
981 int desc = open (infile, O_RDONLY, 0);
982
983 if (desc < 0)
984 fatal (_("failure reopening %s"), infile);
985 for (file_size = 0;;)
986 {
987 i = read (desc, data + file_size, total - file_size);
988 if (i <= 0)
989 break;
990 file_size += i;
991 }
992 file_data = data;
993 data[file_size] = 0;
994
995 close (desc);
996
997 if (file_size > 0 && data[0] != '\\' && data[0] != '@')
998 {
999 error (_("%s: not a texinfo index file"), infile);
1000 return;
1001 }
1002
1003 init_char_order ();
1004
1005 /* Sort routines want to know this address. */
1006
1007 text_base = data;
1008
1009 /* Create the array of pointers to lines, with a default size
1010 frequently enough. */
1011
1012 nlines = total / 50;
1013 if (!nlines)
1014 nlines = 2;
1015 linearray = (char **) xmalloc (nlines * sizeof (char *));
1016
1017 /* `nextline' points to the next free slot in this array.
1018 `nlines' is the allocated size. */
1019
1020 nextline = linearray;
1021
1022 /* Parse the input file's data, and make entries for the lines. */
1023
1024 nextline = parsefile (infile, nextline, file_data, file_size);
1025 if (nextline == 0)
1026 {
1027 error (_("%s: not a texinfo index file"), infile);
1028 return;
1029 }
1030
1031 /* Sort the lines. */
1032
1033 /* If we have enough space, find the first keyfield of each line in advance.
1034 Make a `struct lineinfo' for each line, which records the keyfield
1035 as well as the line, and sort them. */
1036
1037 lineinfo = malloc ((nextline - linearray) * sizeof (struct lineinfo));
1038
1039 if (lineinfo)
1040 {
1041 struct lineinfo *lp;
1042 char **p;
1043
1044 for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1045 {
1046 lp->text = *p;
1047 lp->key.text = find_field (keyfields, *p, &lp->keylen);
1048 if (keyfields->numeric)
1049 lp->key.number = find_value (lp->key.text, lp->keylen);
1050 }
1051
1052 qsort (lineinfo, nextline - linearray, sizeof (struct lineinfo),
1053 compare_prepared);
1054
1055 for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
1056 *p = lp->text;
1057
1058 free (lineinfo);
1059 }
1060 else
1061 qsort (linearray, nextline - linearray, sizeof (char *), compare_full);
1062
1063 /* Open the output file. */
1064
1065 if (outfile)
1066 {
1067 ostream = fopen (outfile, "w");
1068 if (!ostream)
1069 pfatal_with_name (outfile);
1070 }
1071
1072 writelines (linearray, nextline - linearray, ostream);
1073 if (outfile)
1074 fclose (ostream);
1075
1076 free (linearray);
1077 free (data);
1078}
1079
1080
1081/* Parse an input string in core into lines.
1082 DATA is the input string, and SIZE is its length.
1083 Data goes in LINEARRAY starting at NEXTLINE.
1084 The value returned is the first entry in LINEARRAY still unused.
1085 Value 0 means input file contents are invalid. */
1086
1087char **
1088parsefile (char *filename, char **nextline, char *data, long int size)
1089{
1090 char *p, *end;
1091 char **line = nextline;
1092
1093 p = data;
1094 end = p + size;
1095 *end = 0;
1096
1097 while (p != end)
1098 {
1099 if (p[0] != '\\' && p[0] != '@')
1100 return 0;
1101
1102 *line = p;
1103
1104 /* Find the first letter of the first field of this line. If it
1105 is different from the first letter of the first field of the
1106 first line, we need initial headers in the output index. */
1107 while (*p && *p != '{')
1108 p++;
1109 if (p == end)
1110 return 0;
1111 p++;
1112 if (first_initial)
1113 {
1114 if (first_initial != toupper (*p))
1115 need_initials = 1;
1116 }
1117 else
1118 first_initial = toupper (*p);
1119
1120 while (*p && *p != '\n')
1121 p++;
1122 if (p != end)
1123 p++;
1124
1125 line++;
1126 if (line == linearray + nlines)
1127 {
1128 char **old = linearray;
1129 linearray = xrealloc (linearray, sizeof (char *) * (nlines *= 4));
1130 line += linearray - old;
1131 }
1132 }
1133
1134 return line;
1135}
1136
1137
1138/* Indexification is a filter applied to the sorted lines
1139 as they are being written to the output file.
1140 Multiple entries for the same name, with different page numbers,
1141 get combined into a single entry with multiple page numbers.
1142 The first braced field, which is used for sorting, is discarded.
1143 However, its first character is examined, folded to lower case,
1144 and if it is different from that in the previous line fed to us
1145 a \initial line is written with one argument, the new initial.
1146
1147 If an entry has four braced fields, then the second and third
1148 constitute primary and secondary names.
1149 In this case, each change of primary name
1150 generates a \primary line which contains only the primary name,
1151 and in between these are \secondary lines which contain
1152 just a secondary name and page numbers. */
1153
1154/* The last primary name we wrote a \primary entry for.
1155 If only one level of indexing is being done, this is the last name seen. */
1156char *lastprimary;
1157/* Length of storage allocated for lastprimary. */
1158int lastprimarylength;
1159
1160/* Similar, for the secondary name. */
1161char *lastsecondary;
1162int lastsecondarylength;
1163
1164/* Zero if we are not in the middle of writing an entry.
1165 One if we have written the beginning of an entry but have not
1166 yet written any page numbers into it.
1167 Greater than one if we have written the beginning of an entry
1168 plus at least one page number. */
1169int pending;
1170
1171/* The initial (for sorting purposes) of the last primary entry written.
1172 When this changes, a \initial {c} line is written */
1173
1174char *lastinitial;
1175
1176int lastinitiallength;
1177
1178/* When we need a string of length 1 for the value of lastinitial,
1179 store it here. */
1180
1181char lastinitial1[2];
1182
1183/* Initialize static storage for writing an index. */
1184
1185void
1186init_index (void)
1187{
1188 pending = 0;
1189 lastinitial = lastinitial1;
1190 lastinitial1[0] = 0;
1191 lastinitial1[1] = 0;
1192 lastinitiallength = 0;
1193 lastprimarylength = 100;
1194 lastprimary = (char *) xmalloc (lastprimarylength + 1);
1195 memset (lastprimary, '\0', lastprimarylength + 1);
1196 lastsecondarylength = 100;
1197 lastsecondary = (char *) xmalloc (lastsecondarylength + 1);
1198 memset (lastsecondary, '\0', lastsecondarylength + 1);
1199}
1200
1201/* Indexify. Merge entries for the same name,
1202 insert headers for each initial character, etc. */
1203
1204void
1205indexify (char *line, FILE *ostream)
1206{
1207 char *primary, *secondary, *pagenumber;
1208 int primarylength, secondarylength = 0, pagelength;
1209 int nosecondary;
1210 int initiallength;
1211 char *initial;
1212 char initial1[2];
1213 register char *p;
1214
1215 /* First, analyze the parts of the entry fed to us this time. */
1216
1217 p = find_braced_pos (line, 0, 0, 0);
1218 if (*p == '{')
1219 {
1220 initial = p;
1221 /* Get length of inner pair of braces starting at `p',
1222 including that inner pair of braces. */
1223 initiallength = find_braced_end (p + 1) + 1 - p;
1224 }
1225 else
1226 {
1227 initial = initial1;
1228 initial1[0] = toupper (*p);
1229 initial1[1] = 0;
1230 initiallength = 1;
1231 }
1232
1233 pagenumber = find_braced_pos (line, 1, 0, 0);
1234 pagelength = find_braced_end (pagenumber) - pagenumber;
1235 if (pagelength == 0)
1236 fatal (_("No page number in %s"), line);
1237
1238 primary = find_braced_pos (line, 2, 0, 0);
1239 primarylength = find_braced_end (primary) - primary;
1240
1241 secondary = find_braced_pos (line, 3, 0, 0);
1242 nosecondary = !*secondary;
1243 if (!nosecondary)
1244 secondarylength = find_braced_end (secondary) - secondary;
1245
1246 /* If the primary is different from before, make a new primary entry. */
1247 if (strncmp (primary, lastprimary, primarylength))
1248 {
1249 /* Close off current secondary entry first, if one is open. */
1250 if (pending)
1251 {
1252 fputs ("}\n", ostream);
1253 pending = 0;
1254 }
1255
1256 /* If this primary has a different initial, include an entry for
1257 the initial. */
1258 if (need_initials &&
1259 (initiallength != lastinitiallength ||
1260 strncmp (initial, lastinitial, initiallength)))
1261 {
1262 fprintf (ostream, "\\initial {");
1263 fwrite (initial, 1, initiallength, ostream);
1264 fputs ("}\n", ostream);
1265 if (initial == initial1)
1266 {
1267 lastinitial = lastinitial1;
1268 *lastinitial1 = *initial1;
1269 }
1270 else
1271 {
1272 lastinitial = initial;
1273 }
1274 lastinitiallength = initiallength;
1275 }
1276
1277 /* Make the entry for the primary. */
1278 if (nosecondary)
1279 fputs ("\\entry {", ostream);
1280 else
1281 fputs ("\\primary {", ostream);
1282 fwrite (primary, primarylength, 1, ostream);
1283 if (nosecondary)
1284 {
1285 fputs ("}{", ostream);
1286 pending = 1;
1287 }
1288 else
1289 fputs ("}\n", ostream);
1290
1291 /* Record name of most recent primary. */
1292 if (lastprimarylength < primarylength)
1293 {
1294 lastprimarylength = primarylength + 100;
1295 lastprimary = (char *) xrealloc (lastprimary,
1296 1 + lastprimarylength);
1297 }
1298 strncpy (lastprimary, primary, primarylength);
1299 lastprimary[primarylength] = 0;
1300
1301 /* There is no current secondary within this primary, now. */
1302 lastsecondary[0] = 0;
1303 }
1304
1305 /* Should not have an entry with no subtopic following one with a
1306 subtopic. */
1307
1308 if (nosecondary && *lastsecondary)
1309 error (_("entry %s follows an entry with a secondary name"), line);
1310
1311 /* Start a new secondary entry if necessary. */
1312 if (!nosecondary && strncmp (secondary, lastsecondary, secondarylength))
1313 {
1314 if (pending)
1315 {
1316 fputs ("}\n", ostream);
1317 pending = 0;
1318 }
1319
1320 /* Write the entry for the secondary. */
1321 fputs ("\\secondary {", ostream);
1322 fwrite (secondary, secondarylength, 1, ostream);
1323 fputs ("}{", ostream);
1324 pending = 1;
1325
1326 /* Record name of most recent secondary. */
1327 if (lastsecondarylength < secondarylength)
1328 {
1329 lastsecondarylength = secondarylength + 100;
1330 lastsecondary = (char *) xrealloc (lastsecondary,
1331 1 + lastsecondarylength);
1332 }
1333 strncpy (lastsecondary, secondary, secondarylength);
1334 lastsecondary[secondarylength] = 0;
1335 }
1336
1337 /* Here to add one more page number to the current entry. */
1338 if (pending++ != 1)
1339 fputs (", ", ostream); /* Punctuate first, if this is not the first. */
1340 fwrite (pagenumber, pagelength, 1, ostream);
1341}
1342
1343/* Close out any unfinished output entry. */
1344
1345void
1346finish_index (FILE *ostream)
1347{
1348 if (pending)
1349 fputs ("}\n", ostream);
1350 free (lastprimary);
1351 free (lastsecondary);
1352}
1353
1354
1355/* Copy the lines in the sorted order.
1356 Each line is copied out of the input file it was found in. */
1357
1358void
1359writelines (char **linearray, int nlines, FILE *ostream)
1360{
1361 char **stop_line = linearray + nlines;
1362 char **next_line;
1363
1364 init_index ();
1365
1366 /* Output the text of the lines, and free the buffer space. */
1367
1368 for (next_line = linearray; next_line != stop_line; next_line++)
1369 {
1370 /* If -u was specified, output the line only if distinct from
1371 previous one. */
1372 if (next_line == linearray
1373 /* Compare previous line with this one, using only the
1374 explicitly specd keyfields. */
1375 || compare_general (*(next_line - 1), *next_line, 0L, 0L,
1376 num_keyfields - 1))
1377 {
1378 char *p = *next_line;
1379 char c;
1380
1381 while ((c = *p++) && c != '\n')
1382 /* Do nothing. */ ;
1383 *(p - 1) = 0;
1384 indexify (*next_line, ostream);
1385 }
1386 }
1387
1388 finish_index (ostream);
1389}
1390
1391
1392/* Assume (and optionally verify) that each input file is sorted;
1393 merge them and output the result.
1394 Returns nonzero if any input file fails to be sorted.
1395
1396 This is the high-level interface that can handle an unlimited
1397 number of files. */
1398
1399#define MAX_DIRECT_MERGE 10
1400
1401int
1402merge_files (char **infiles, int nfiles, char *outfile)
1403{
1404 char **tempfiles;
1405 int ntemps;
1406 int i;
1407 int value = 0;
1408 int start_tempcount = tempcount;
1409
1410 if (nfiles <= MAX_DIRECT_MERGE)
1411 return merge_direct (infiles, nfiles, outfile);
1412
1413 /* Merge groups of MAX_DIRECT_MERGE input files at a time,
1414 making a temporary file to hold each group's result. */
1415
1416 ntemps = (nfiles + MAX_DIRECT_MERGE - 1) / MAX_DIRECT_MERGE;
1417 tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
1418 for (i = 0; i < ntemps; i++)
1419 {
1420 int nf = MAX_DIRECT_MERGE;
1421 if (i + 1 == ntemps)
1422 nf = nfiles - i * MAX_DIRECT_MERGE;
1423 tempfiles[i] = maketempname (++tempcount);
1424 value |= merge_direct (&infiles[i * MAX_DIRECT_MERGE], nf, tempfiles[i]);
1425 }
1426
1427 /* All temporary files that existed before are no longer needed
1428 since their contents have been merged into our new tempfiles.
1429 So delete them. */
1430 flush_tempfiles (start_tempcount);
1431
1432 /* Now merge the temporary files we created. */
1433
1434 merge_files (tempfiles, ntemps, outfile);
1435
1436 free (tempfiles);
1437
1438 return value;
1439}
1440
1441
1442/* Assume (and optionally verify) that each input file is sorted;
1443 merge them and output the result.
1444 Returns nonzero if any input file fails to be sorted.
1445
1446 This version of merging will not work if the number of
1447 input files gets too high. Higher level functions
1448 use it only with a bounded number of input files. */
1449
1450int
1451merge_direct (char **infiles, int nfiles, char *outfile)
1452{
1453 struct linebuffer *lb1, *lb2;
1454 struct linebuffer **thisline, **prevline;
1455 FILE **streams;
1456 int i;
1457 int nleft;
1458 int lossage = 0;
1459 int *file_lossage;
1460 struct linebuffer *prev_out = 0;
1461 FILE *ostream = stdout;
1462
1463 if (outfile)
1464 {
1465 ostream = fopen (outfile, "w");
1466 }
1467 if (!ostream)
1468 pfatal_with_name (outfile);
1469
1470 init_index ();
1471
1472 if (nfiles == 0)
1473 {
1474 if (outfile)
1475 fclose (ostream);
1476 return 0;
1477 }
1478
1479 /* For each file, make two line buffers. Also, for each file, there
1480 is an element of `thisline' which points at any time to one of the
1481 file's two buffers, and an element of `prevline' which points to
1482 the other buffer. `thisline' is supposed to point to the next
1483 available line from the file, while `prevline' holds the last file
1484 line used, which is remembered so that we can verify that the file
1485 is properly sorted. */
1486
1487 /* lb1 and lb2 contain one buffer each per file. */
1488 lb1 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1489 lb2 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
1490
1491 /* thisline[i] points to the linebuffer holding the next available
1492 line in file i, or is zero if there are no lines left in that file. */
1493 thisline = (struct linebuffer **)
1494 xmalloc (nfiles * sizeof (struct linebuffer *));
1495 /* prevline[i] points to the linebuffer holding the last used line
1496 from file i. This is just for verifying that file i is properly
1497 sorted. */
1498 prevline = (struct linebuffer **)
1499 xmalloc (nfiles * sizeof (struct linebuffer *));
1500 /* streams[i] holds the input stream for file i. */
1501 streams = (FILE **) xmalloc (nfiles * sizeof (FILE *));
1502 /* file_lossage[i] is nonzero if we already know file i is not
1503 properly sorted. */
1504 file_lossage = (int *) xmalloc (nfiles * sizeof (int));
1505
1506 /* Allocate and initialize all that storage. */
1507
1508 for (i = 0; i < nfiles; i++)
1509 {
1510 initbuffer (&lb1[i]);
1511 initbuffer (&lb2[i]);
1512 thisline[i] = &lb1[i];
1513 prevline[i] = &lb2[i];
1514 file_lossage[i] = 0;
1515 streams[i] = fopen (infiles[i], "r");
1516 if (!streams[i])
1517 pfatal_with_name (infiles[i]);
1518
1519 readline (thisline[i], streams[i]);
1520 }
1521
1522 /* Keep count of number of files not at eof. */
1523 nleft = nfiles;
1524
1525 while (nleft)
1526 {
1527 struct linebuffer *best = 0;
1528 struct linebuffer *exch;
1529 int bestfile = -1;
1530 int i;
1531
1532 /* Look at the next avail line of each file; choose the least one. */
1533
1534 for (i = 0; i < nfiles; i++)
1535 {
1536 if (thisline[i] &&
1537 (!best ||
1538 0 < compare_general (best->buffer, thisline[i]->buffer,
1539 (long) bestfile, (long) i, num_keyfields)))
1540 {
1541 best = thisline[i];
1542 bestfile = i;
1543 }
1544 }
1545
1546 /* Output that line, unless it matches the previous one and we
1547 don't want duplicates. */
1548
1549 if (!(prev_out &&
1550 !compare_general (prev_out->buffer,
1551 best->buffer, 0L, 1L, num_keyfields - 1)))
1552 indexify (best->buffer, ostream);
1553 prev_out = best;
1554
1555 /* Now make the line the previous of its file, and fetch a new
1556 line from that file. */
1557
1558 exch = prevline[bestfile];
1559 prevline[bestfile] = thisline[bestfile];
1560 thisline[bestfile] = exch;
1561
1562 while (1)
1563 {
1564 /* If the file has no more, mark it empty. */
1565
1566 if (feof (streams[bestfile]))
1567 {
1568 thisline[bestfile] = 0;
1569 /* Update the number of files still not empty. */
1570 nleft--;
1571 break;
1572 }
1573 readline (thisline[bestfile], streams[bestfile]);
1574 if (thisline[bestfile]->buffer[0] || !feof (streams[bestfile]))
1575 break;
1576 }
1577 }
1578
1579 finish_index (ostream);
1580
1581 /* Free all storage and close all input streams. */
1582
1583 for (i = 0; i < nfiles; i++)
1584 {
1585 fclose (streams[i]);
1586 free (lb1[i].buffer);
1587 free (lb2[i].buffer);
1588 }
1589 free (file_lossage);
1590 free (lb1);
1591 free (lb2);
1592 free (thisline);
1593 free (prevline);
1594 free (streams);
1595
1596 if (outfile)
1597 fclose (ostream);
1598
1599 return lossage;
1600}
1601
1602
1603/* Print error message and exit. */
1604
1605void
1606fatal (const char *format, const char *arg)
1607{
1608 error (format, arg);
1609 xexit (1);
1610}
1611
1612/* Print error message. FORMAT is printf control string, ARG is arg for it. */
1613void
1614error (const char *format, const char *arg)
1615{
1616 printf ("%s: ", program_name);
1617 printf (format, arg);
1618 if (format[strlen (format) -1] != '\n')
1619 printf ("\n");
1620}
1621
1622void
1623perror_with_name (const char *name)
1624{
1625 fprintf (stderr, "%s: ", program_name);
1626 perror (name);
1627}
1628
1629void
1630pfatal_with_name (const char *name)
1631{
1632 perror_with_name (name);
1633 xexit (1);
1634}
1635
1636
1637
1638/* Return a newly-allocated string concatenating S1 and S2. */
1639
1640char *
1641concat (char *s1, char *s2)
1642{
1643 int len1 = strlen (s1), len2 = strlen (s2);
1644 char *result = (char *) xmalloc (len1 + len2 + 1);
1645
1646 strcpy (result, s1);
1647 strcpy (result + len1, s2);
1648 *(result + len1 + len2) = 0;
1649
1650 return result;
1651}
Note: See TracBrowser for help on using the repository browser.