1 | /* texindex -- sort TeX index dribble output into an actual index.
|
---|
2 | $Id: texindex.c,v 1.11 2004/04/11 17:56:47 karl Exp $
|
---|
3 |
|
---|
4 | Copyright (C) 1987, 1991, 1992, 1996, 1997, 1998, 1999, 2000, 2001,
|
---|
5 | 2002, 2003, 2004 Free Software Foundation, Inc.
|
---|
6 |
|
---|
7 | This program is free software; you can redistribute it and/or modify
|
---|
8 | it under the terms of the GNU General Public License as published by
|
---|
9 | the Free Software Foundation; either version 2, or (at your option)
|
---|
10 | any later version.
|
---|
11 |
|
---|
12 | This program is distributed in the hope that it will be useful,
|
---|
13 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
14 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
15 | GNU General Public License for more details.
|
---|
16 |
|
---|
17 | You should have received a copy of the GNU General Public License
|
---|
18 | along with this program; if not, write to the Free Software
|
---|
19 | Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307. */
|
---|
20 |
|
---|
21 | #include "system.h"
|
---|
22 | #include <getopt.h>
|
---|
23 |
|
---|
24 | #ifdef __EMX__
|
---|
25 | # include <stdlib.h>
|
---|
26 | #endif
|
---|
27 |
|
---|
28 | static char *program_name = "texindex";
|
---|
29 |
|
---|
30 | #if defined (emacs)
|
---|
31 | # include "../src/config.h"
|
---|
32 | /* Some s/os.h files redefine these. */
|
---|
33 | # undef read
|
---|
34 | # undef close
|
---|
35 | # undef write
|
---|
36 | # undef open
|
---|
37 | #endif
|
---|
38 |
|
---|
39 | #if !defined (HAVE_MEMSET)
|
---|
40 | #undef memset
|
---|
41 | #define memset(ptr, ignore, count) bzero (ptr, count)
|
---|
42 | #endif
|
---|
43 |
|
---|
44 | char *mktemp (char *);
|
---|
45 |
|
---|
46 | #if !defined (SEEK_SET)
|
---|
47 | # define SEEK_SET 0
|
---|
48 | # define SEEK_CUR 1
|
---|
49 | # define SEEK_END 2
|
---|
50 | #endif /* !SEEK_SET */
|
---|
51 |
|
---|
52 | struct linebuffer;
|
---|
53 |
|
---|
54 | /* When sorting in core, this structure describes one line
|
---|
55 | and the position and length of its first keyfield. */
|
---|
56 | struct lineinfo
|
---|
57 | {
|
---|
58 | char *text; /* The actual text of the line. */
|
---|
59 | union {
|
---|
60 | char *text; /* The start of the key (for textual comparison). */
|
---|
61 | long number; /* The numeric value (for numeric comparison). */
|
---|
62 | } key;
|
---|
63 | long keylen; /* Length of KEY field. */
|
---|
64 | };
|
---|
65 |
|
---|
66 | /* This structure describes a field to use as a sort key. */
|
---|
67 | struct keyfield
|
---|
68 | {
|
---|
69 | int startwords; /* Number of words to skip. */
|
---|
70 | int startchars; /* Number of additional chars to skip. */
|
---|
71 | int endwords; /* Number of words to ignore at end. */
|
---|
72 | int endchars; /* Ditto for characters of last word. */
|
---|
73 | char ignore_blanks; /* Non-zero means ignore spaces and tabs. */
|
---|
74 | char fold_case; /* Non-zero means case doesn't matter. */
|
---|
75 | char reverse; /* Non-zero means compare in reverse order. */
|
---|
76 | char numeric; /* Non-zeros means field is ASCII numeric. */
|
---|
77 | char positional; /* Sort according to file position. */
|
---|
78 | char braced; /* Count balanced-braced groupings as fields. */
|
---|
79 | };
|
---|
80 |
|
---|
81 | /* Vector of keyfields to use. */
|
---|
82 | struct keyfield keyfields[3];
|
---|
83 |
|
---|
84 | /* Number of keyfields stored in that vector. */
|
---|
85 | int num_keyfields = 3;
|
---|
86 |
|
---|
87 | /* Vector of input file names, terminated with a null pointer. */
|
---|
88 | char **infiles;
|
---|
89 |
|
---|
90 | /* Vector of corresponding output file names, or NULL, meaning default it
|
---|
91 | (add an `s' to the end). */
|
---|
92 | char **outfiles;
|
---|
93 |
|
---|
94 | /* Length of `infiles'. */
|
---|
95 | int num_infiles;
|
---|
96 |
|
---|
97 | /* Pointer to the array of pointers to lines being sorted. */
|
---|
98 | char **linearray;
|
---|
99 |
|
---|
100 | /* The allocated length of `linearray'. */
|
---|
101 | long nlines;
|
---|
102 |
|
---|
103 | /* Directory to use for temporary files. On Unix, it ends with a slash. */
|
---|
104 | char *tempdir;
|
---|
105 |
|
---|
106 | /* Number of last temporary file. */
|
---|
107 | int tempcount;
|
---|
108 |
|
---|
109 | /* Number of last temporary file already deleted.
|
---|
110 | Temporary files are deleted by `flush_tempfiles' in order of creation. */
|
---|
111 | int last_deleted_tempcount;
|
---|
112 |
|
---|
113 | /* During in-core sort, this points to the base of the data block
|
---|
114 | which contains all the lines of data. */
|
---|
115 | char *text_base;
|
---|
116 |
|
---|
117 | /* Initially 0; changed to 1 if we want initials in this index. */
|
---|
118 | int need_initials;
|
---|
119 |
|
---|
120 | /* Remembers the first initial letter seen in this index, so we can
|
---|
121 | determine whether we need initials in the sorted form. */
|
---|
122 | char first_initial;
|
---|
123 |
|
---|
124 | /* Additional command switches .*/
|
---|
125 |
|
---|
126 | /* Nonzero means do not delete tempfiles -- for debugging. */
|
---|
127 | int keep_tempfiles;
|
---|
128 |
|
---|
129 | /* Forward declarations of functions in this file. */
|
---|
130 | void decode_command (int argc, char **argv);
|
---|
131 | void sort_in_core (char *infile, int total, char *outfile);
|
---|
132 | void sort_offline (char *infile, off_t total, char *outfile);
|
---|
133 | char **parsefile (char *filename, char **nextline, char *data, long int size);
|
---|
134 | char *find_field (struct keyfield *keyfield, char *str, long int *lengthptr);
|
---|
135 | char *find_pos (char *str, int words, int chars, int ignore_blanks);
|
---|
136 | long find_value (char *start, long int length);
|
---|
137 | char *find_braced_pos (char *str, int words, int chars, int ignore_blanks);
|
---|
138 | char *find_braced_end (char *str);
|
---|
139 | void writelines (char **linearray, int nlines, FILE *ostream);
|
---|
140 | int compare_field (struct keyfield *keyfield, char *start1,
|
---|
141 | long int length1, long int pos1, char *start2,
|
---|
142 | long int length2, long int pos2);
|
---|
143 | int compare_full (const void *, const void *);
|
---|
144 | long readline (struct linebuffer *linebuffer, FILE *stream);
|
---|
145 | int merge_files (char **infiles, int nfiles, char *outfile);
|
---|
146 | int merge_direct (char **infiles, int nfiles, char *outfile);
|
---|
147 | void pfatal_with_name (const char *name);
|
---|
148 | void fatal (const char *format, const char *arg);
|
---|
149 | void error (const char *format, const char *arg);
|
---|
150 | void *xmalloc (), *xrealloc ();
|
---|
151 | char *concat (char *s1, char *s2);
|
---|
152 | void flush_tempfiles (int to_count);
|
---|
153 | |
---|
154 |
|
---|
155 | #define MAX_IN_CORE_SORT 500000
|
---|
156 |
|
---|
157 | int
|
---|
158 | main (int argc, char **argv)
|
---|
159 | {
|
---|
160 | int i;
|
---|
161 |
|
---|
162 | tempcount = 0;
|
---|
163 | last_deleted_tempcount = 0;
|
---|
164 |
|
---|
165 | #ifdef __EMX__
|
---|
166 | _response(&argc, &argv);
|
---|
167 | _wildcard(&argc, &argv);
|
---|
168 | #endif
|
---|
169 |
|
---|
170 | #ifdef HAVE_SETLOCALE
|
---|
171 | /* Set locale via LC_ALL. */
|
---|
172 | setlocale (LC_ALL, "");
|
---|
173 | #endif
|
---|
174 |
|
---|
175 | /* Set the text message domain. */
|
---|
176 | bindtextdomain (PACKAGE, LOCALEDIR);
|
---|
177 | textdomain (PACKAGE);
|
---|
178 |
|
---|
179 | /* In case we write to a redirected stdout that fails. */
|
---|
180 | /* not ready atexit (close_stdout); */
|
---|
181 |
|
---|
182 | /* Describe the kind of sorting to do. */
|
---|
183 | /* The first keyfield uses the first braced field and folds case. */
|
---|
184 | keyfields[0].braced = 1;
|
---|
185 | keyfields[0].fold_case = 1;
|
---|
186 | keyfields[0].endwords = -1;
|
---|
187 | keyfields[0].endchars = -1;
|
---|
188 |
|
---|
189 | /* The second keyfield uses the second braced field, numerically. */
|
---|
190 | keyfields[1].braced = 1;
|
---|
191 | keyfields[1].numeric = 1;
|
---|
192 | keyfields[1].startwords = 1;
|
---|
193 | keyfields[1].endwords = -1;
|
---|
194 | keyfields[1].endchars = -1;
|
---|
195 |
|
---|
196 | /* The third keyfield (which is ignored while discarding duplicates)
|
---|
197 | compares the whole line. */
|
---|
198 | keyfields[2].endwords = -1;
|
---|
199 | keyfields[2].endchars = -1;
|
---|
200 |
|
---|
201 | decode_command (argc, argv);
|
---|
202 |
|
---|
203 | /* Process input files completely, one by one. */
|
---|
204 |
|
---|
205 | for (i = 0; i < num_infiles; i++)
|
---|
206 | {
|
---|
207 | int desc;
|
---|
208 | off_t ptr;
|
---|
209 | char *outfile;
|
---|
210 | struct stat instat;
|
---|
211 |
|
---|
212 | desc = open (infiles[i], O_RDONLY, 0);
|
---|
213 | if (desc < 0)
|
---|
214 | pfatal_with_name (infiles[i]);
|
---|
215 |
|
---|
216 | if (stat (infiles[i], &instat))
|
---|
217 | pfatal_with_name (infiles[i]);
|
---|
218 | if (S_ISDIR (instat.st_mode))
|
---|
219 | {
|
---|
220 | #ifdef EISDIR
|
---|
221 | errno = EISDIR;
|
---|
222 | #endif
|
---|
223 | pfatal_with_name (infiles[i]);
|
---|
224 | }
|
---|
225 |
|
---|
226 | lseek (desc, (off_t) 0, SEEK_END);
|
---|
227 | ptr = (off_t) lseek (desc, (off_t) 0, SEEK_CUR);
|
---|
228 |
|
---|
229 | close (desc);
|
---|
230 |
|
---|
231 | outfile = outfiles[i];
|
---|
232 | if (!outfile)
|
---|
233 | outfile = concat (infiles[i], "s");
|
---|
234 |
|
---|
235 | need_initials = 0;
|
---|
236 | first_initial = '\0';
|
---|
237 |
|
---|
238 | if (ptr < MAX_IN_CORE_SORT)
|
---|
239 | /* Sort a small amount of data. */
|
---|
240 | sort_in_core (infiles[i], (int)ptr, outfile);
|
---|
241 | else
|
---|
242 | sort_offline (infiles[i], ptr, outfile);
|
---|
243 | }
|
---|
244 |
|
---|
245 | flush_tempfiles (tempcount);
|
---|
246 | xexit (0);
|
---|
247 | return 0; /* Avoid bogus warnings. */
|
---|
248 | }
|
---|
249 | |
---|
250 |
|
---|
251 | typedef struct
|
---|
252 | {
|
---|
253 | char *long_name;
|
---|
254 | char *short_name;
|
---|
255 | int *variable_ref;
|
---|
256 | int variable_value;
|
---|
257 | char *arg_name;
|
---|
258 | char *doc_string;
|
---|
259 | } TEXINDEX_OPTION;
|
---|
260 |
|
---|
261 | TEXINDEX_OPTION texindex_options[] = {
|
---|
262 | { "--help", "-h", (int *)NULL, 0, (char *)NULL,
|
---|
263 | N_("display this help and exit") },
|
---|
264 | { "--keep", "-k", &keep_tempfiles, 1, (char *)NULL,
|
---|
265 | N_("keep temporary files around after processing") },
|
---|
266 | { "--no-keep", 0, &keep_tempfiles, 0, (char *)NULL,
|
---|
267 | N_("do not keep temporary files around after processing (default)") },
|
---|
268 | { "--output", "-o", (int *)NULL, 0, "FILE",
|
---|
269 | N_("send output to FILE") },
|
---|
270 | { "--version", (char *)NULL, (int *)NULL, 0, (char *)NULL,
|
---|
271 | N_("display version information and exit") },
|
---|
272 | { (char *)NULL, (char *)NULL, (int *)NULL, 0, (char *)NULL }
|
---|
273 | };
|
---|
274 |
|
---|
275 | void
|
---|
276 | usage (int result_value)
|
---|
277 | {
|
---|
278 | register int i;
|
---|
279 | FILE *f = result_value ? stderr : stdout;
|
---|
280 |
|
---|
281 | fprintf (f, _("Usage: %s [OPTION]... FILE...\n"), program_name);
|
---|
282 | fprintf (f, _("Generate a sorted index for each TeX output FILE.\n"));
|
---|
283 | /* Avoid trigraph nonsense. */
|
---|
284 | fprintf (f,
|
---|
285 | _("Usually FILE... is specified as `foo.%c%c\' for a document `foo.texi'.\n"),
|
---|
286 | '?', '?'); /* avoid trigraph in cat-id-tbl.c */
|
---|
287 | fprintf (f, _("\nOptions:\n"));
|
---|
288 |
|
---|
289 | for (i = 0; texindex_options[i].long_name; i++)
|
---|
290 | {
|
---|
291 | putc (' ', f);
|
---|
292 |
|
---|
293 | if (texindex_options[i].short_name)
|
---|
294 | fprintf (f, "%s, ", texindex_options[i].short_name);
|
---|
295 |
|
---|
296 | fprintf (f, "%s %s",
|
---|
297 | texindex_options[i].long_name,
|
---|
298 | texindex_options[i].arg_name
|
---|
299 | ? texindex_options[i].arg_name : "");
|
---|
300 |
|
---|
301 | fprintf (f, "\t%s\n", _(texindex_options[i].doc_string));
|
---|
302 | }
|
---|
303 | fputs (_("\n\
|
---|
304 | Email bug reports to bug-texinfo@gnu.org,\n\
|
---|
305 | general questions and discussion to help-texinfo@gnu.org.\n\
|
---|
306 | Texinfo home page: http://www.gnu.org/software/texinfo/"), f);
|
---|
307 | fputs ("\n", f);
|
---|
308 |
|
---|
309 | xexit (result_value);
|
---|
310 | }
|
---|
311 |
|
---|
312 | /* Decode the command line arguments to set the parameter variables
|
---|
313 | and set up the vector of keyfields and the vector of input files. */
|
---|
314 |
|
---|
315 | void
|
---|
316 | decode_command (int argc, char **argv)
|
---|
317 | {
|
---|
318 | int arg_index = 1;
|
---|
319 | char **ip;
|
---|
320 | char **op;
|
---|
321 |
|
---|
322 | /* Store default values into parameter variables. */
|
---|
323 |
|
---|
324 | tempdir = getenv ("TMPDIR");
|
---|
325 | if (tempdir == NULL)
|
---|
326 | tempdir = getenv ("TEMP");
|
---|
327 | if (tempdir == NULL)
|
---|
328 | tempdir = getenv ("TMP");
|
---|
329 | if (tempdir == NULL)
|
---|
330 | tempdir = DEFAULT_TMPDIR;
|
---|
331 | else
|
---|
332 | tempdir = concat (tempdir, "/");
|
---|
333 |
|
---|
334 | keep_tempfiles = 0;
|
---|
335 |
|
---|
336 | /* Allocate ARGC input files, which must be enough. */
|
---|
337 |
|
---|
338 | infiles = (char **) xmalloc (argc * sizeof (char *));
|
---|
339 | outfiles = (char **) xmalloc (argc * sizeof (char *));
|
---|
340 | ip = infiles;
|
---|
341 | op = outfiles;
|
---|
342 |
|
---|
343 | while (arg_index < argc)
|
---|
344 | {
|
---|
345 | char *arg = argv[arg_index++];
|
---|
346 |
|
---|
347 | if (*arg == '-')
|
---|
348 | {
|
---|
349 | if (strcmp (arg, "--version") == 0)
|
---|
350 | {
|
---|
351 | printf ("texindex (GNU %s) %s\n", PACKAGE, VERSION);
|
---|
352 | puts ("");
|
---|
353 | puts ("Copyright (C) 2004 Free Software Foundation, Inc.");
|
---|
354 | printf (_("There is NO warranty. You may redistribute this software\n\
|
---|
355 | under the terms of the GNU General Public License.\n\
|
---|
356 | For more information about these matters, see the files named COPYING.\n"));
|
---|
357 | xexit (0);
|
---|
358 | }
|
---|
359 | else if ((strcmp (arg, "--keep") == 0) ||
|
---|
360 | (strcmp (arg, "-k") == 0))
|
---|
361 | {
|
---|
362 | keep_tempfiles = 1;
|
---|
363 | }
|
---|
364 | else if ((strcmp (arg, "--help") == 0) ||
|
---|
365 | (strcmp (arg, "-h") == 0))
|
---|
366 | {
|
---|
367 | usage (0);
|
---|
368 | }
|
---|
369 | else if ((strcmp (arg, "--output") == 0) ||
|
---|
370 | (strcmp (arg, "-o") == 0))
|
---|
371 | {
|
---|
372 | if (argv[arg_index] != (char *)NULL)
|
---|
373 | {
|
---|
374 | arg_index++;
|
---|
375 | if (op > outfiles)
|
---|
376 | *(op - 1) = argv[arg_index];
|
---|
377 | }
|
---|
378 | else
|
---|
379 | usage (1);
|
---|
380 | }
|
---|
381 | else
|
---|
382 | usage (1);
|
---|
383 | }
|
---|
384 | else
|
---|
385 | {
|
---|
386 | *ip++ = arg;
|
---|
387 | *op++ = (char *)NULL;
|
---|
388 | }
|
---|
389 | }
|
---|
390 |
|
---|
391 | /* Record number of keyfields and terminate list of filenames. */
|
---|
392 | num_infiles = ip - infiles;
|
---|
393 | *ip = (char *)NULL;
|
---|
394 | if (num_infiles == 0)
|
---|
395 | usage (1);
|
---|
396 | }
|
---|
397 | |
---|
398 |
|
---|
399 | /* Return a name for temporary file COUNT. */
|
---|
400 |
|
---|
401 | static char *
|
---|
402 | maketempname (int count)
|
---|
403 | {
|
---|
404 | static char *tempbase = NULL;
|
---|
405 | char tempsuffix[10];
|
---|
406 |
|
---|
407 | if (!tempbase)
|
---|
408 | {
|
---|
409 | int fd;
|
---|
410 | tempbase = concat (tempdir, "txidxXXXXXX");
|
---|
411 |
|
---|
412 | fd = mkstemp (tempbase);
|
---|
413 | if (fd == -1)
|
---|
414 | pfatal_with_name (tempbase);
|
---|
415 | }
|
---|
416 |
|
---|
417 | sprintf (tempsuffix, ".%d", count);
|
---|
418 | return concat (tempbase, tempsuffix);
|
---|
419 | }
|
---|
420 |
|
---|
421 |
|
---|
422 | /* Delete all temporary files up to TO_COUNT. */
|
---|
423 |
|
---|
424 | void
|
---|
425 | flush_tempfiles (int to_count)
|
---|
426 | {
|
---|
427 | if (keep_tempfiles)
|
---|
428 | return;
|
---|
429 | while (last_deleted_tempcount < to_count)
|
---|
430 | unlink (maketempname (++last_deleted_tempcount));
|
---|
431 | }
|
---|
432 |
|
---|
433 | |
---|
434 |
|
---|
435 | /* Compare LINE1 and LINE2 according to the specified set of keyfields. */
|
---|
436 |
|
---|
437 | int
|
---|
438 | compare_full (const void *p1, const void *p2)
|
---|
439 | {
|
---|
440 | char **line1 = (char **) p1;
|
---|
441 | char **line2 = (char **) p2;
|
---|
442 | int i;
|
---|
443 |
|
---|
444 | /* Compare using the first keyfield;
|
---|
445 | if that does not distinguish the lines, try the second keyfield;
|
---|
446 | and so on. */
|
---|
447 |
|
---|
448 | for (i = 0; i < num_keyfields; i++)
|
---|
449 | {
|
---|
450 | long length1, length2;
|
---|
451 | char *start1 = find_field (&keyfields[i], *line1, &length1);
|
---|
452 | char *start2 = find_field (&keyfields[i], *line2, &length2);
|
---|
453 | int tem = compare_field (&keyfields[i], start1, length1,
|
---|
454 | *line1 - text_base,
|
---|
455 | start2, length2, *line2 - text_base);
|
---|
456 | if (tem)
|
---|
457 | {
|
---|
458 | if (keyfields[i].reverse)
|
---|
459 | return -tem;
|
---|
460 | return tem;
|
---|
461 | }
|
---|
462 | }
|
---|
463 |
|
---|
464 | return 0; /* Lines match exactly. */
|
---|
465 | }
|
---|
466 |
|
---|
467 | /* Compare LINE1 and LINE2, described by structures
|
---|
468 | in which the first keyfield is identified in advance.
|
---|
469 | For positional sorting, assumes that the order of the lines in core
|
---|
470 | reflects their nominal order. */
|
---|
471 | int
|
---|
472 | compare_prepared (const void *p1, const void *p2)
|
---|
473 | {
|
---|
474 | struct lineinfo *line1 = (struct lineinfo *) p1;
|
---|
475 | struct lineinfo *line2 = (struct lineinfo *) p2;
|
---|
476 | int i;
|
---|
477 | int tem;
|
---|
478 | char *text1, *text2;
|
---|
479 |
|
---|
480 | /* Compare using the first keyfield, which has been found for us already. */
|
---|
481 | if (keyfields->positional)
|
---|
482 | {
|
---|
483 | if (line1->text - text_base > line2->text - text_base)
|
---|
484 | tem = 1;
|
---|
485 | else
|
---|
486 | tem = -1;
|
---|
487 | }
|
---|
488 | else if (keyfields->numeric)
|
---|
489 | tem = line1->key.number - line2->key.number;
|
---|
490 | else
|
---|
491 | tem = compare_field (keyfields, line1->key.text, line1->keylen, 0,
|
---|
492 | line2->key.text, line2->keylen, 0);
|
---|
493 | if (tem)
|
---|
494 | {
|
---|
495 | if (keyfields->reverse)
|
---|
496 | return -tem;
|
---|
497 | return tem;
|
---|
498 | }
|
---|
499 |
|
---|
500 | text1 = line1->text;
|
---|
501 | text2 = line2->text;
|
---|
502 |
|
---|
503 | /* Compare using the second keyfield;
|
---|
504 | if that does not distinguish the lines, try the third keyfield;
|
---|
505 | and so on. */
|
---|
506 |
|
---|
507 | for (i = 1; i < num_keyfields; i++)
|
---|
508 | {
|
---|
509 | long length1, length2;
|
---|
510 | char *start1 = find_field (&keyfields[i], text1, &length1);
|
---|
511 | char *start2 = find_field (&keyfields[i], text2, &length2);
|
---|
512 | int tem = compare_field (&keyfields[i], start1, length1,
|
---|
513 | text1 - text_base,
|
---|
514 | start2, length2, text2 - text_base);
|
---|
515 | if (tem)
|
---|
516 | {
|
---|
517 | if (keyfields[i].reverse)
|
---|
518 | return -tem;
|
---|
519 | return tem;
|
---|
520 | }
|
---|
521 | }
|
---|
522 |
|
---|
523 | return 0; /* Lines match exactly. */
|
---|
524 | }
|
---|
525 |
|
---|
526 | /* Like compare_full but more general.
|
---|
527 | You can pass any strings, and you can say how many keyfields to use.
|
---|
528 | POS1 and POS2 should indicate the nominal positional ordering of
|
---|
529 | the two lines in the input. */
|
---|
530 |
|
---|
531 | int
|
---|
532 | compare_general (char *str1, char *str2, long int pos1, long int pos2, int use_keyfields)
|
---|
533 | {
|
---|
534 | int i;
|
---|
535 |
|
---|
536 | /* Compare using the first keyfield;
|
---|
537 | if that does not distinguish the lines, try the second keyfield;
|
---|
538 | and so on. */
|
---|
539 |
|
---|
540 | for (i = 0; i < use_keyfields; i++)
|
---|
541 | {
|
---|
542 | long length1, length2;
|
---|
543 | char *start1 = find_field (&keyfields[i], str1, &length1);
|
---|
544 | char *start2 = find_field (&keyfields[i], str2, &length2);
|
---|
545 | int tem = compare_field (&keyfields[i], start1, length1, pos1,
|
---|
546 | start2, length2, pos2);
|
---|
547 | if (tem)
|
---|
548 | {
|
---|
549 | if (keyfields[i].reverse)
|
---|
550 | return -tem;
|
---|
551 | return tem;
|
---|
552 | }
|
---|
553 | }
|
---|
554 |
|
---|
555 | return 0; /* Lines match exactly. */
|
---|
556 | }
|
---|
557 |
|
---|
558 | /* Find the start and length of a field in STR according to KEYFIELD.
|
---|
559 | A pointer to the starting character is returned, and the length
|
---|
560 | is stored into the int that LENGTHPTR points to. */
|
---|
561 |
|
---|
562 | char *
|
---|
563 | find_field (struct keyfield *keyfield, char *str, long int *lengthptr)
|
---|
564 | {
|
---|
565 | char *start;
|
---|
566 | char *end;
|
---|
567 | char *(*fun) ();
|
---|
568 |
|
---|
569 | if (keyfield->braced)
|
---|
570 | fun = find_braced_pos;
|
---|
571 | else
|
---|
572 | fun = find_pos;
|
---|
573 |
|
---|
574 | start = (*fun) (str, keyfield->startwords, keyfield->startchars,
|
---|
575 | keyfield->ignore_blanks);
|
---|
576 | if (keyfield->endwords < 0)
|
---|
577 | {
|
---|
578 | if (keyfield->braced)
|
---|
579 | end = find_braced_end (start);
|
---|
580 | else
|
---|
581 | {
|
---|
582 | end = start;
|
---|
583 | while (*end && *end != '\n')
|
---|
584 | end++;
|
---|
585 | }
|
---|
586 | }
|
---|
587 | else
|
---|
588 | {
|
---|
589 | end = (*fun) (str, keyfield->endwords, keyfield->endchars, 0);
|
---|
590 | if (end - str < start - str)
|
---|
591 | end = start;
|
---|
592 | }
|
---|
593 | *lengthptr = end - start;
|
---|
594 | return start;
|
---|
595 | }
|
---|
596 |
|
---|
597 | /* Return a pointer to a specified place within STR,
|
---|
598 | skipping (from the beginning) WORDS words and then CHARS chars.
|
---|
599 | If IGNORE_BLANKS is nonzero, we skip all blanks
|
---|
600 | after finding the specified word. */
|
---|
601 |
|
---|
602 | char *
|
---|
603 | find_pos (char *str, int words, int chars, int ignore_blanks)
|
---|
604 | {
|
---|
605 | int i;
|
---|
606 | char *p = str;
|
---|
607 |
|
---|
608 | for (i = 0; i < words; i++)
|
---|
609 | {
|
---|
610 | char c;
|
---|
611 | /* Find next bunch of nonblanks and skip them. */
|
---|
612 | while ((c = *p) == ' ' || c == '\t')
|
---|
613 | p++;
|
---|
614 | while ((c = *p) && c != '\n' && !(c == ' ' || c == '\t'))
|
---|
615 | p++;
|
---|
616 | if (!*p || *p == '\n')
|
---|
617 | return p;
|
---|
618 | }
|
---|
619 |
|
---|
620 | while (*p == ' ' || *p == '\t')
|
---|
621 | p++;
|
---|
622 |
|
---|
623 | for (i = 0; i < chars; i++)
|
---|
624 | {
|
---|
625 | if (!*p || *p == '\n')
|
---|
626 | break;
|
---|
627 | p++;
|
---|
628 | }
|
---|
629 | return p;
|
---|
630 | }
|
---|
631 |
|
---|
632 | /* Like find_pos but assumes that each field is surrounded by braces
|
---|
633 | and that braces within fields are balanced. */
|
---|
634 |
|
---|
635 | char *
|
---|
636 | find_braced_pos (char *str, int words, int chars, int ignore_blanks)
|
---|
637 | {
|
---|
638 | int i;
|
---|
639 | int bracelevel;
|
---|
640 | char *p = str;
|
---|
641 | char c;
|
---|
642 |
|
---|
643 | for (i = 0; i < words; i++)
|
---|
644 | {
|
---|
645 | bracelevel = 1;
|
---|
646 | while ((c = *p++) != '{' && c != '\n' && c)
|
---|
647 | /* Do nothing. */ ;
|
---|
648 | if (c != '{')
|
---|
649 | return p - 1;
|
---|
650 | while (bracelevel)
|
---|
651 | {
|
---|
652 | c = *p++;
|
---|
653 | if (c == '{')
|
---|
654 | bracelevel++;
|
---|
655 | if (c == '}')
|
---|
656 | bracelevel--;
|
---|
657 | if (c == 0 || c == '\n')
|
---|
658 | return p - 1;
|
---|
659 | }
|
---|
660 | }
|
---|
661 |
|
---|
662 | while ((c = *p++) != '{' && c != '\n' && c)
|
---|
663 | /* Do nothing. */ ;
|
---|
664 |
|
---|
665 | if (c != '{')
|
---|
666 | return p - 1;
|
---|
667 |
|
---|
668 | if (ignore_blanks)
|
---|
669 | while ((c = *p) == ' ' || c == '\t')
|
---|
670 | p++;
|
---|
671 |
|
---|
672 | for (i = 0; i < chars; i++)
|
---|
673 | {
|
---|
674 | if (!*p || *p == '\n')
|
---|
675 | break;
|
---|
676 | p++;
|
---|
677 | }
|
---|
678 | return p;
|
---|
679 | }
|
---|
680 |
|
---|
681 | /* Find the end of the balanced-brace field which starts at STR.
|
---|
682 | The position returned is just before the closing brace. */
|
---|
683 |
|
---|
684 | char *
|
---|
685 | find_braced_end (char *str)
|
---|
686 | {
|
---|
687 | int bracelevel;
|
---|
688 | char *p = str;
|
---|
689 | char c;
|
---|
690 |
|
---|
691 | bracelevel = 1;
|
---|
692 | while (bracelevel)
|
---|
693 | {
|
---|
694 | c = *p++;
|
---|
695 | if (c == '{')
|
---|
696 | bracelevel++;
|
---|
697 | if (c == '}')
|
---|
698 | bracelevel--;
|
---|
699 | if (c == 0 || c == '\n')
|
---|
700 | return p - 1;
|
---|
701 | }
|
---|
702 | return p - 1;
|
---|
703 | }
|
---|
704 |
|
---|
705 | long
|
---|
706 | find_value (char *start, long int length)
|
---|
707 | {
|
---|
708 | while (length != 0L)
|
---|
709 | {
|
---|
710 | if (isdigit (*start))
|
---|
711 | return atol (start);
|
---|
712 | length--;
|
---|
713 | start++;
|
---|
714 | }
|
---|
715 | return 0l;
|
---|
716 | }
|
---|
717 |
|
---|
718 | /* Vector used to translate characters for comparison.
|
---|
719 | This is how we make all alphanumerics follow all else,
|
---|
720 | and ignore case in the first sorting. */
|
---|
721 | int char_order[256];
|
---|
722 |
|
---|
723 | void
|
---|
724 | init_char_order (void)
|
---|
725 | {
|
---|
726 | int i;
|
---|
727 | for (i = 1; i < 256; i++)
|
---|
728 | char_order[i] = i;
|
---|
729 |
|
---|
730 | for (i = '0'; i <= '9'; i++)
|
---|
731 | char_order[i] += 512;
|
---|
732 |
|
---|
733 | for (i = 'a'; i <= 'z'; i++)
|
---|
734 | {
|
---|
735 | char_order[i] = 512 + i;
|
---|
736 | char_order[i + 'A' - 'a'] = 512 + i;
|
---|
737 | }
|
---|
738 | }
|
---|
739 |
|
---|
740 | /* Compare two fields (each specified as a start pointer and a character count)
|
---|
741 | according to KEYFIELD.
|
---|
742 | The sign of the value reports the relation between the fields. */
|
---|
743 |
|
---|
744 | int
|
---|
745 | compare_field (struct keyfield *keyfield, char *start1, long int length1,
|
---|
746 | long int pos1, char *start2, long int length2, long int pos2)
|
---|
747 | {
|
---|
748 | if (keyfields->positional)
|
---|
749 | {
|
---|
750 | if (pos1 > pos2)
|
---|
751 | return 1;
|
---|
752 | else
|
---|
753 | return -1;
|
---|
754 | }
|
---|
755 | if (keyfield->numeric)
|
---|
756 | {
|
---|
757 | long value = find_value (start1, length1) - find_value (start2, length2);
|
---|
758 | if (value > 0)
|
---|
759 | return 1;
|
---|
760 | if (value < 0)
|
---|
761 | return -1;
|
---|
762 | return 0;
|
---|
763 | }
|
---|
764 | else
|
---|
765 | {
|
---|
766 | char *p1 = start1;
|
---|
767 | char *p2 = start2;
|
---|
768 | char *e1 = start1 + length1;
|
---|
769 | char *e2 = start2 + length2;
|
---|
770 |
|
---|
771 | while (1)
|
---|
772 | {
|
---|
773 | int c1, c2;
|
---|
774 |
|
---|
775 | if (p1 == e1)
|
---|
776 | c1 = 0;
|
---|
777 | else
|
---|
778 | c1 = *p1++;
|
---|
779 | if (p2 == e2)
|
---|
780 | c2 = 0;
|
---|
781 | else
|
---|
782 | c2 = *p2++;
|
---|
783 |
|
---|
784 | if (char_order[c1] != char_order[c2])
|
---|
785 | return char_order[c1] - char_order[c2];
|
---|
786 | if (!c1)
|
---|
787 | break;
|
---|
788 | }
|
---|
789 |
|
---|
790 | /* Strings are equal except possibly for case. */
|
---|
791 | p1 = start1;
|
---|
792 | p2 = start2;
|
---|
793 | while (1)
|
---|
794 | {
|
---|
795 | int c1, c2;
|
---|
796 |
|
---|
797 | if (p1 == e1)
|
---|
798 | c1 = 0;
|
---|
799 | else
|
---|
800 | c1 = *p1++;
|
---|
801 | if (p2 == e2)
|
---|
802 | c2 = 0;
|
---|
803 | else
|
---|
804 | c2 = *p2++;
|
---|
805 |
|
---|
806 | if (c1 != c2)
|
---|
807 | /* Reverse sign here so upper case comes out last. */
|
---|
808 | return c2 - c1;
|
---|
809 | if (!c1)
|
---|
810 | break;
|
---|
811 | }
|
---|
812 |
|
---|
813 | return 0;
|
---|
814 | }
|
---|
815 | }
|
---|
816 | |
---|
817 |
|
---|
818 | /* A `struct linebuffer' is a structure which holds a line of text.
|
---|
819 | `readline' reads a line from a stream into a linebuffer
|
---|
820 | and works regardless of the length of the line. */
|
---|
821 |
|
---|
822 | struct linebuffer
|
---|
823 | {
|
---|
824 | long size;
|
---|
825 | char *buffer;
|
---|
826 | };
|
---|
827 |
|
---|
828 | /* Initialize LINEBUFFER for use. */
|
---|
829 |
|
---|
830 | void
|
---|
831 | initbuffer (struct linebuffer *linebuffer)
|
---|
832 | {
|
---|
833 | linebuffer->size = 200;
|
---|
834 | linebuffer->buffer = (char *) xmalloc (200);
|
---|
835 | }
|
---|
836 |
|
---|
837 | /* Read a line of text from STREAM into LINEBUFFER.
|
---|
838 | Return the length of the line. */
|
---|
839 |
|
---|
840 | long
|
---|
841 | readline (struct linebuffer *linebuffer, FILE *stream)
|
---|
842 | {
|
---|
843 | char *buffer = linebuffer->buffer;
|
---|
844 | char *p = linebuffer->buffer;
|
---|
845 | char *end = p + linebuffer->size;
|
---|
846 |
|
---|
847 | while (1)
|
---|
848 | {
|
---|
849 | int c = getc (stream);
|
---|
850 | if (p == end)
|
---|
851 | {
|
---|
852 | buffer = (char *) xrealloc (buffer, linebuffer->size *= 2);
|
---|
853 | p += buffer - linebuffer->buffer;
|
---|
854 | end += buffer - linebuffer->buffer;
|
---|
855 | linebuffer->buffer = buffer;
|
---|
856 | }
|
---|
857 | if (c < 0 || c == '\n')
|
---|
858 | {
|
---|
859 | *p = 0;
|
---|
860 | break;
|
---|
861 | }
|
---|
862 | *p++ = c;
|
---|
863 | }
|
---|
864 |
|
---|
865 | return p - buffer;
|
---|
866 | }
|
---|
867 | |
---|
868 |
|
---|
869 | /* Sort an input file too big to sort in core. */
|
---|
870 |
|
---|
871 | void
|
---|
872 | sort_offline (char *infile, off_t total, char *outfile)
|
---|
873 | {
|
---|
874 | /* More than enough. */
|
---|
875 | int ntemps = 2 * (total + MAX_IN_CORE_SORT - 1) / MAX_IN_CORE_SORT;
|
---|
876 | char **tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
|
---|
877 | FILE *istream = fopen (infile, "r");
|
---|
878 | int i;
|
---|
879 | struct linebuffer lb;
|
---|
880 | long linelength;
|
---|
881 | int failure = 0;
|
---|
882 |
|
---|
883 | initbuffer (&lb);
|
---|
884 |
|
---|
885 | /* Read in one line of input data. */
|
---|
886 |
|
---|
887 | linelength = readline (&lb, istream);
|
---|
888 |
|
---|
889 | if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
|
---|
890 | {
|
---|
891 | error (_("%s: not a texinfo index file"), infile);
|
---|
892 | return;
|
---|
893 | }
|
---|
894 |
|
---|
895 | /* Split up the input into `ntemps' temporary files, or maybe fewer,
|
---|
896 | and put the new files' names into `tempfiles' */
|
---|
897 |
|
---|
898 | for (i = 0; i < ntemps; i++)
|
---|
899 | {
|
---|
900 | char *outname = maketempname (++tempcount);
|
---|
901 | FILE *ostream = fopen (outname, "w");
|
---|
902 | long tempsize = 0;
|
---|
903 |
|
---|
904 | if (!ostream)
|
---|
905 | pfatal_with_name (outname);
|
---|
906 | tempfiles[i] = outname;
|
---|
907 |
|
---|
908 | /* Copy lines into this temp file as long as it does not make file
|
---|
909 | "too big" or until there are no more lines. */
|
---|
910 |
|
---|
911 | while (tempsize + linelength + 1 <= MAX_IN_CORE_SORT)
|
---|
912 | {
|
---|
913 | tempsize += linelength + 1;
|
---|
914 | fputs (lb.buffer, ostream);
|
---|
915 | putc ('\n', ostream);
|
---|
916 |
|
---|
917 | /* Read another line of input data. */
|
---|
918 |
|
---|
919 | linelength = readline (&lb, istream);
|
---|
920 | if (!linelength && feof (istream))
|
---|
921 | break;
|
---|
922 |
|
---|
923 | if (lb.buffer[0] != '\\' && lb.buffer[0] != '@')
|
---|
924 | {
|
---|
925 | error (_("%s: not a texinfo index file"), infile);
|
---|
926 | failure = 1;
|
---|
927 | goto fail;
|
---|
928 | }
|
---|
929 | }
|
---|
930 | fclose (ostream);
|
---|
931 | if (feof (istream))
|
---|
932 | break;
|
---|
933 | }
|
---|
934 |
|
---|
935 | free (lb.buffer);
|
---|
936 |
|
---|
937 | fail:
|
---|
938 | /* Record number of temp files we actually needed. */
|
---|
939 |
|
---|
940 | ntemps = i;
|
---|
941 |
|
---|
942 | /* Sort each tempfile into another tempfile.
|
---|
943 | Delete the first set of tempfiles and put the names of the second
|
---|
944 | into `tempfiles'. */
|
---|
945 |
|
---|
946 | for (i = 0; i < ntemps; i++)
|
---|
947 | {
|
---|
948 | char *newtemp = maketempname (++tempcount);
|
---|
949 | sort_in_core (tempfiles[i], MAX_IN_CORE_SORT, newtemp);
|
---|
950 | if (!keep_tempfiles)
|
---|
951 | unlink (tempfiles[i]);
|
---|
952 | tempfiles[i] = newtemp;
|
---|
953 | }
|
---|
954 |
|
---|
955 | if (failure)
|
---|
956 | return;
|
---|
957 |
|
---|
958 | /* Merge the tempfiles together and indexify. */
|
---|
959 |
|
---|
960 | merge_files (tempfiles, ntemps, outfile);
|
---|
961 | }
|
---|
962 | |
---|
963 |
|
---|
964 | /* Sort INFILE, whose size is TOTAL,
|
---|
965 | assuming that is small enough to be done in-core,
|
---|
966 | then indexify it and send the output to OUTFILE (or to stdout). */
|
---|
967 |
|
---|
968 | void
|
---|
969 | sort_in_core (char *infile, int total, char *outfile)
|
---|
970 | {
|
---|
971 | char **nextline;
|
---|
972 | char *data = (char *) xmalloc (total + 1);
|
---|
973 | char *file_data;
|
---|
974 | long file_size;
|
---|
975 | int i;
|
---|
976 | FILE *ostream = stdout;
|
---|
977 | struct lineinfo *lineinfo;
|
---|
978 |
|
---|
979 | /* Read the contents of the file into the moby array `data'. */
|
---|
980 |
|
---|
981 | int desc = open (infile, O_RDONLY, 0);
|
---|
982 |
|
---|
983 | if (desc < 0)
|
---|
984 | fatal (_("failure reopening %s"), infile);
|
---|
985 | for (file_size = 0;;)
|
---|
986 | {
|
---|
987 | i = read (desc, data + file_size, total - file_size);
|
---|
988 | if (i <= 0)
|
---|
989 | break;
|
---|
990 | file_size += i;
|
---|
991 | }
|
---|
992 | file_data = data;
|
---|
993 | data[file_size] = 0;
|
---|
994 |
|
---|
995 | close (desc);
|
---|
996 |
|
---|
997 | if (file_size > 0 && data[0] != '\\' && data[0] != '@')
|
---|
998 | {
|
---|
999 | error (_("%s: not a texinfo index file"), infile);
|
---|
1000 | return;
|
---|
1001 | }
|
---|
1002 |
|
---|
1003 | init_char_order ();
|
---|
1004 |
|
---|
1005 | /* Sort routines want to know this address. */
|
---|
1006 |
|
---|
1007 | text_base = data;
|
---|
1008 |
|
---|
1009 | /* Create the array of pointers to lines, with a default size
|
---|
1010 | frequently enough. */
|
---|
1011 |
|
---|
1012 | nlines = total / 50;
|
---|
1013 | if (!nlines)
|
---|
1014 | nlines = 2;
|
---|
1015 | linearray = (char **) xmalloc (nlines * sizeof (char *));
|
---|
1016 |
|
---|
1017 | /* `nextline' points to the next free slot in this array.
|
---|
1018 | `nlines' is the allocated size. */
|
---|
1019 |
|
---|
1020 | nextline = linearray;
|
---|
1021 |
|
---|
1022 | /* Parse the input file's data, and make entries for the lines. */
|
---|
1023 |
|
---|
1024 | nextline = parsefile (infile, nextline, file_data, file_size);
|
---|
1025 | if (nextline == 0)
|
---|
1026 | {
|
---|
1027 | error (_("%s: not a texinfo index file"), infile);
|
---|
1028 | return;
|
---|
1029 | }
|
---|
1030 |
|
---|
1031 | /* Sort the lines. */
|
---|
1032 |
|
---|
1033 | /* If we have enough space, find the first keyfield of each line in advance.
|
---|
1034 | Make a `struct lineinfo' for each line, which records the keyfield
|
---|
1035 | as well as the line, and sort them. */
|
---|
1036 |
|
---|
1037 | lineinfo = malloc ((nextline - linearray) * sizeof (struct lineinfo));
|
---|
1038 |
|
---|
1039 | if (lineinfo)
|
---|
1040 | {
|
---|
1041 | struct lineinfo *lp;
|
---|
1042 | char **p;
|
---|
1043 |
|
---|
1044 | for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
|
---|
1045 | {
|
---|
1046 | lp->text = *p;
|
---|
1047 | lp->key.text = find_field (keyfields, *p, &lp->keylen);
|
---|
1048 | if (keyfields->numeric)
|
---|
1049 | lp->key.number = find_value (lp->key.text, lp->keylen);
|
---|
1050 | }
|
---|
1051 |
|
---|
1052 | qsort (lineinfo, nextline - linearray, sizeof (struct lineinfo),
|
---|
1053 | compare_prepared);
|
---|
1054 |
|
---|
1055 | for (lp = lineinfo, p = linearray; p != nextline; lp++, p++)
|
---|
1056 | *p = lp->text;
|
---|
1057 |
|
---|
1058 | free (lineinfo);
|
---|
1059 | }
|
---|
1060 | else
|
---|
1061 | qsort (linearray, nextline - linearray, sizeof (char *), compare_full);
|
---|
1062 |
|
---|
1063 | /* Open the output file. */
|
---|
1064 |
|
---|
1065 | if (outfile)
|
---|
1066 | {
|
---|
1067 | ostream = fopen (outfile, "w");
|
---|
1068 | if (!ostream)
|
---|
1069 | pfatal_with_name (outfile);
|
---|
1070 | }
|
---|
1071 |
|
---|
1072 | writelines (linearray, nextline - linearray, ostream);
|
---|
1073 | if (outfile)
|
---|
1074 | fclose (ostream);
|
---|
1075 |
|
---|
1076 | free (linearray);
|
---|
1077 | free (data);
|
---|
1078 | }
|
---|
1079 | |
---|
1080 |
|
---|
1081 | /* Parse an input string in core into lines.
|
---|
1082 | DATA is the input string, and SIZE is its length.
|
---|
1083 | Data goes in LINEARRAY starting at NEXTLINE.
|
---|
1084 | The value returned is the first entry in LINEARRAY still unused.
|
---|
1085 | Value 0 means input file contents are invalid. */
|
---|
1086 |
|
---|
1087 | char **
|
---|
1088 | parsefile (char *filename, char **nextline, char *data, long int size)
|
---|
1089 | {
|
---|
1090 | char *p, *end;
|
---|
1091 | char **line = nextline;
|
---|
1092 |
|
---|
1093 | p = data;
|
---|
1094 | end = p + size;
|
---|
1095 | *end = 0;
|
---|
1096 |
|
---|
1097 | while (p != end)
|
---|
1098 | {
|
---|
1099 | if (p[0] != '\\' && p[0] != '@')
|
---|
1100 | return 0;
|
---|
1101 |
|
---|
1102 | *line = p;
|
---|
1103 |
|
---|
1104 | /* Find the first letter of the first field of this line. If it
|
---|
1105 | is different from the first letter of the first field of the
|
---|
1106 | first line, we need initial headers in the output index. */
|
---|
1107 | while (*p && *p != '{')
|
---|
1108 | p++;
|
---|
1109 | if (p == end)
|
---|
1110 | return 0;
|
---|
1111 | p++;
|
---|
1112 | if (first_initial)
|
---|
1113 | {
|
---|
1114 | if (first_initial != toupper (*p))
|
---|
1115 | need_initials = 1;
|
---|
1116 | }
|
---|
1117 | else
|
---|
1118 | first_initial = toupper (*p);
|
---|
1119 |
|
---|
1120 | while (*p && *p != '\n')
|
---|
1121 | p++;
|
---|
1122 | if (p != end)
|
---|
1123 | p++;
|
---|
1124 |
|
---|
1125 | line++;
|
---|
1126 | if (line == linearray + nlines)
|
---|
1127 | {
|
---|
1128 | char **old = linearray;
|
---|
1129 | linearray = xrealloc (linearray, sizeof (char *) * (nlines *= 4));
|
---|
1130 | line += linearray - old;
|
---|
1131 | }
|
---|
1132 | }
|
---|
1133 |
|
---|
1134 | return line;
|
---|
1135 | }
|
---|
1136 | |
---|
1137 |
|
---|
1138 | /* Indexification is a filter applied to the sorted lines
|
---|
1139 | as they are being written to the output file.
|
---|
1140 | Multiple entries for the same name, with different page numbers,
|
---|
1141 | get combined into a single entry with multiple page numbers.
|
---|
1142 | The first braced field, which is used for sorting, is discarded.
|
---|
1143 | However, its first character is examined, folded to lower case,
|
---|
1144 | and if it is different from that in the previous line fed to us
|
---|
1145 | a \initial line is written with one argument, the new initial.
|
---|
1146 |
|
---|
1147 | If an entry has four braced fields, then the second and third
|
---|
1148 | constitute primary and secondary names.
|
---|
1149 | In this case, each change of primary name
|
---|
1150 | generates a \primary line which contains only the primary name,
|
---|
1151 | and in between these are \secondary lines which contain
|
---|
1152 | just a secondary name and page numbers. */
|
---|
1153 |
|
---|
1154 | /* The last primary name we wrote a \primary entry for.
|
---|
1155 | If only one level of indexing is being done, this is the last name seen. */
|
---|
1156 | char *lastprimary;
|
---|
1157 | /* Length of storage allocated for lastprimary. */
|
---|
1158 | int lastprimarylength;
|
---|
1159 |
|
---|
1160 | /* Similar, for the secondary name. */
|
---|
1161 | char *lastsecondary;
|
---|
1162 | int lastsecondarylength;
|
---|
1163 |
|
---|
1164 | /* Zero if we are not in the middle of writing an entry.
|
---|
1165 | One if we have written the beginning of an entry but have not
|
---|
1166 | yet written any page numbers into it.
|
---|
1167 | Greater than one if we have written the beginning of an entry
|
---|
1168 | plus at least one page number. */
|
---|
1169 | int pending;
|
---|
1170 |
|
---|
1171 | /* The initial (for sorting purposes) of the last primary entry written.
|
---|
1172 | When this changes, a \initial {c} line is written */
|
---|
1173 |
|
---|
1174 | char *lastinitial;
|
---|
1175 |
|
---|
1176 | int lastinitiallength;
|
---|
1177 |
|
---|
1178 | /* When we need a string of length 1 for the value of lastinitial,
|
---|
1179 | store it here. */
|
---|
1180 |
|
---|
1181 | char lastinitial1[2];
|
---|
1182 |
|
---|
1183 | /* Initialize static storage for writing an index. */
|
---|
1184 |
|
---|
1185 | void
|
---|
1186 | init_index (void)
|
---|
1187 | {
|
---|
1188 | pending = 0;
|
---|
1189 | lastinitial = lastinitial1;
|
---|
1190 | lastinitial1[0] = 0;
|
---|
1191 | lastinitial1[1] = 0;
|
---|
1192 | lastinitiallength = 0;
|
---|
1193 | lastprimarylength = 100;
|
---|
1194 | lastprimary = (char *) xmalloc (lastprimarylength + 1);
|
---|
1195 | memset (lastprimary, '\0', lastprimarylength + 1);
|
---|
1196 | lastsecondarylength = 100;
|
---|
1197 | lastsecondary = (char *) xmalloc (lastsecondarylength + 1);
|
---|
1198 | memset (lastsecondary, '\0', lastsecondarylength + 1);
|
---|
1199 | }
|
---|
1200 |
|
---|
1201 | /* Indexify. Merge entries for the same name,
|
---|
1202 | insert headers for each initial character, etc. */
|
---|
1203 |
|
---|
1204 | void
|
---|
1205 | indexify (char *line, FILE *ostream)
|
---|
1206 | {
|
---|
1207 | char *primary, *secondary, *pagenumber;
|
---|
1208 | int primarylength, secondarylength = 0, pagelength;
|
---|
1209 | int nosecondary;
|
---|
1210 | int initiallength;
|
---|
1211 | char *initial;
|
---|
1212 | char initial1[2];
|
---|
1213 | register char *p;
|
---|
1214 |
|
---|
1215 | /* First, analyze the parts of the entry fed to us this time. */
|
---|
1216 |
|
---|
1217 | p = find_braced_pos (line, 0, 0, 0);
|
---|
1218 | if (*p == '{')
|
---|
1219 | {
|
---|
1220 | initial = p;
|
---|
1221 | /* Get length of inner pair of braces starting at `p',
|
---|
1222 | including that inner pair of braces. */
|
---|
1223 | initiallength = find_braced_end (p + 1) + 1 - p;
|
---|
1224 | }
|
---|
1225 | else
|
---|
1226 | {
|
---|
1227 | initial = initial1;
|
---|
1228 | initial1[0] = toupper (*p);
|
---|
1229 | initial1[1] = 0;
|
---|
1230 | initiallength = 1;
|
---|
1231 | }
|
---|
1232 |
|
---|
1233 | pagenumber = find_braced_pos (line, 1, 0, 0);
|
---|
1234 | pagelength = find_braced_end (pagenumber) - pagenumber;
|
---|
1235 | if (pagelength == 0)
|
---|
1236 | fatal (_("No page number in %s"), line);
|
---|
1237 |
|
---|
1238 | primary = find_braced_pos (line, 2, 0, 0);
|
---|
1239 | primarylength = find_braced_end (primary) - primary;
|
---|
1240 |
|
---|
1241 | secondary = find_braced_pos (line, 3, 0, 0);
|
---|
1242 | nosecondary = !*secondary;
|
---|
1243 | if (!nosecondary)
|
---|
1244 | secondarylength = find_braced_end (secondary) - secondary;
|
---|
1245 |
|
---|
1246 | /* If the primary is different from before, make a new primary entry. */
|
---|
1247 | if (strncmp (primary, lastprimary, primarylength))
|
---|
1248 | {
|
---|
1249 | /* Close off current secondary entry first, if one is open. */
|
---|
1250 | if (pending)
|
---|
1251 | {
|
---|
1252 | fputs ("}\n", ostream);
|
---|
1253 | pending = 0;
|
---|
1254 | }
|
---|
1255 |
|
---|
1256 | /* If this primary has a different initial, include an entry for
|
---|
1257 | the initial. */
|
---|
1258 | if (need_initials &&
|
---|
1259 | (initiallength != lastinitiallength ||
|
---|
1260 | strncmp (initial, lastinitial, initiallength)))
|
---|
1261 | {
|
---|
1262 | fprintf (ostream, "\\initial {");
|
---|
1263 | fwrite (initial, 1, initiallength, ostream);
|
---|
1264 | fputs ("}\n", ostream);
|
---|
1265 | if (initial == initial1)
|
---|
1266 | {
|
---|
1267 | lastinitial = lastinitial1;
|
---|
1268 | *lastinitial1 = *initial1;
|
---|
1269 | }
|
---|
1270 | else
|
---|
1271 | {
|
---|
1272 | lastinitial = initial;
|
---|
1273 | }
|
---|
1274 | lastinitiallength = initiallength;
|
---|
1275 | }
|
---|
1276 |
|
---|
1277 | /* Make the entry for the primary. */
|
---|
1278 | if (nosecondary)
|
---|
1279 | fputs ("\\entry {", ostream);
|
---|
1280 | else
|
---|
1281 | fputs ("\\primary {", ostream);
|
---|
1282 | fwrite (primary, primarylength, 1, ostream);
|
---|
1283 | if (nosecondary)
|
---|
1284 | {
|
---|
1285 | fputs ("}{", ostream);
|
---|
1286 | pending = 1;
|
---|
1287 | }
|
---|
1288 | else
|
---|
1289 | fputs ("}\n", ostream);
|
---|
1290 |
|
---|
1291 | /* Record name of most recent primary. */
|
---|
1292 | if (lastprimarylength < primarylength)
|
---|
1293 | {
|
---|
1294 | lastprimarylength = primarylength + 100;
|
---|
1295 | lastprimary = (char *) xrealloc (lastprimary,
|
---|
1296 | 1 + lastprimarylength);
|
---|
1297 | }
|
---|
1298 | strncpy (lastprimary, primary, primarylength);
|
---|
1299 | lastprimary[primarylength] = 0;
|
---|
1300 |
|
---|
1301 | /* There is no current secondary within this primary, now. */
|
---|
1302 | lastsecondary[0] = 0;
|
---|
1303 | }
|
---|
1304 |
|
---|
1305 | /* Should not have an entry with no subtopic following one with a
|
---|
1306 | subtopic. */
|
---|
1307 |
|
---|
1308 | if (nosecondary && *lastsecondary)
|
---|
1309 | error (_("entry %s follows an entry with a secondary name"), line);
|
---|
1310 |
|
---|
1311 | /* Start a new secondary entry if necessary. */
|
---|
1312 | if (!nosecondary && strncmp (secondary, lastsecondary, secondarylength))
|
---|
1313 | {
|
---|
1314 | if (pending)
|
---|
1315 | {
|
---|
1316 | fputs ("}\n", ostream);
|
---|
1317 | pending = 0;
|
---|
1318 | }
|
---|
1319 |
|
---|
1320 | /* Write the entry for the secondary. */
|
---|
1321 | fputs ("\\secondary {", ostream);
|
---|
1322 | fwrite (secondary, secondarylength, 1, ostream);
|
---|
1323 | fputs ("}{", ostream);
|
---|
1324 | pending = 1;
|
---|
1325 |
|
---|
1326 | /* Record name of most recent secondary. */
|
---|
1327 | if (lastsecondarylength < secondarylength)
|
---|
1328 | {
|
---|
1329 | lastsecondarylength = secondarylength + 100;
|
---|
1330 | lastsecondary = (char *) xrealloc (lastsecondary,
|
---|
1331 | 1 + lastsecondarylength);
|
---|
1332 | }
|
---|
1333 | strncpy (lastsecondary, secondary, secondarylength);
|
---|
1334 | lastsecondary[secondarylength] = 0;
|
---|
1335 | }
|
---|
1336 |
|
---|
1337 | /* Here to add one more page number to the current entry. */
|
---|
1338 | if (pending++ != 1)
|
---|
1339 | fputs (", ", ostream); /* Punctuate first, if this is not the first. */
|
---|
1340 | fwrite (pagenumber, pagelength, 1, ostream);
|
---|
1341 | }
|
---|
1342 |
|
---|
1343 | /* Close out any unfinished output entry. */
|
---|
1344 |
|
---|
1345 | void
|
---|
1346 | finish_index (FILE *ostream)
|
---|
1347 | {
|
---|
1348 | if (pending)
|
---|
1349 | fputs ("}\n", ostream);
|
---|
1350 | free (lastprimary);
|
---|
1351 | free (lastsecondary);
|
---|
1352 | }
|
---|
1353 | |
---|
1354 |
|
---|
1355 | /* Copy the lines in the sorted order.
|
---|
1356 | Each line is copied out of the input file it was found in. */
|
---|
1357 |
|
---|
1358 | void
|
---|
1359 | writelines (char **linearray, int nlines, FILE *ostream)
|
---|
1360 | {
|
---|
1361 | char **stop_line = linearray + nlines;
|
---|
1362 | char **next_line;
|
---|
1363 |
|
---|
1364 | init_index ();
|
---|
1365 |
|
---|
1366 | /* Output the text of the lines, and free the buffer space. */
|
---|
1367 |
|
---|
1368 | for (next_line = linearray; next_line != stop_line; next_line++)
|
---|
1369 | {
|
---|
1370 | /* If -u was specified, output the line only if distinct from
|
---|
1371 | previous one. */
|
---|
1372 | if (next_line == linearray
|
---|
1373 | /* Compare previous line with this one, using only the
|
---|
1374 | explicitly specd keyfields. */
|
---|
1375 | || compare_general (*(next_line - 1), *next_line, 0L, 0L,
|
---|
1376 | num_keyfields - 1))
|
---|
1377 | {
|
---|
1378 | char *p = *next_line;
|
---|
1379 | char c;
|
---|
1380 |
|
---|
1381 | while ((c = *p++) && c != '\n')
|
---|
1382 | /* Do nothing. */ ;
|
---|
1383 | *(p - 1) = 0;
|
---|
1384 | indexify (*next_line, ostream);
|
---|
1385 | }
|
---|
1386 | }
|
---|
1387 |
|
---|
1388 | finish_index (ostream);
|
---|
1389 | }
|
---|
1390 | |
---|
1391 |
|
---|
1392 | /* Assume (and optionally verify) that each input file is sorted;
|
---|
1393 | merge them and output the result.
|
---|
1394 | Returns nonzero if any input file fails to be sorted.
|
---|
1395 |
|
---|
1396 | This is the high-level interface that can handle an unlimited
|
---|
1397 | number of files. */
|
---|
1398 |
|
---|
1399 | #define MAX_DIRECT_MERGE 10
|
---|
1400 |
|
---|
1401 | int
|
---|
1402 | merge_files (char **infiles, int nfiles, char *outfile)
|
---|
1403 | {
|
---|
1404 | char **tempfiles;
|
---|
1405 | int ntemps;
|
---|
1406 | int i;
|
---|
1407 | int value = 0;
|
---|
1408 | int start_tempcount = tempcount;
|
---|
1409 |
|
---|
1410 | if (nfiles <= MAX_DIRECT_MERGE)
|
---|
1411 | return merge_direct (infiles, nfiles, outfile);
|
---|
1412 |
|
---|
1413 | /* Merge groups of MAX_DIRECT_MERGE input files at a time,
|
---|
1414 | making a temporary file to hold each group's result. */
|
---|
1415 |
|
---|
1416 | ntemps = (nfiles + MAX_DIRECT_MERGE - 1) / MAX_DIRECT_MERGE;
|
---|
1417 | tempfiles = (char **) xmalloc (ntemps * sizeof (char *));
|
---|
1418 | for (i = 0; i < ntemps; i++)
|
---|
1419 | {
|
---|
1420 | int nf = MAX_DIRECT_MERGE;
|
---|
1421 | if (i + 1 == ntemps)
|
---|
1422 | nf = nfiles - i * MAX_DIRECT_MERGE;
|
---|
1423 | tempfiles[i] = maketempname (++tempcount);
|
---|
1424 | value |= merge_direct (&infiles[i * MAX_DIRECT_MERGE], nf, tempfiles[i]);
|
---|
1425 | }
|
---|
1426 |
|
---|
1427 | /* All temporary files that existed before are no longer needed
|
---|
1428 | since their contents have been merged into our new tempfiles.
|
---|
1429 | So delete them. */
|
---|
1430 | flush_tempfiles (start_tempcount);
|
---|
1431 |
|
---|
1432 | /* Now merge the temporary files we created. */
|
---|
1433 |
|
---|
1434 | merge_files (tempfiles, ntemps, outfile);
|
---|
1435 |
|
---|
1436 | free (tempfiles);
|
---|
1437 |
|
---|
1438 | return value;
|
---|
1439 | }
|
---|
1440 | |
---|
1441 |
|
---|
1442 | /* Assume (and optionally verify) that each input file is sorted;
|
---|
1443 | merge them and output the result.
|
---|
1444 | Returns nonzero if any input file fails to be sorted.
|
---|
1445 |
|
---|
1446 | This version of merging will not work if the number of
|
---|
1447 | input files gets too high. Higher level functions
|
---|
1448 | use it only with a bounded number of input files. */
|
---|
1449 |
|
---|
1450 | int
|
---|
1451 | merge_direct (char **infiles, int nfiles, char *outfile)
|
---|
1452 | {
|
---|
1453 | struct linebuffer *lb1, *lb2;
|
---|
1454 | struct linebuffer **thisline, **prevline;
|
---|
1455 | FILE **streams;
|
---|
1456 | int i;
|
---|
1457 | int nleft;
|
---|
1458 | int lossage = 0;
|
---|
1459 | int *file_lossage;
|
---|
1460 | struct linebuffer *prev_out = 0;
|
---|
1461 | FILE *ostream = stdout;
|
---|
1462 |
|
---|
1463 | if (outfile)
|
---|
1464 | {
|
---|
1465 | ostream = fopen (outfile, "w");
|
---|
1466 | }
|
---|
1467 | if (!ostream)
|
---|
1468 | pfatal_with_name (outfile);
|
---|
1469 |
|
---|
1470 | init_index ();
|
---|
1471 |
|
---|
1472 | if (nfiles == 0)
|
---|
1473 | {
|
---|
1474 | if (outfile)
|
---|
1475 | fclose (ostream);
|
---|
1476 | return 0;
|
---|
1477 | }
|
---|
1478 |
|
---|
1479 | /* For each file, make two line buffers. Also, for each file, there
|
---|
1480 | is an element of `thisline' which points at any time to one of the
|
---|
1481 | file's two buffers, and an element of `prevline' which points to
|
---|
1482 | the other buffer. `thisline' is supposed to point to the next
|
---|
1483 | available line from the file, while `prevline' holds the last file
|
---|
1484 | line used, which is remembered so that we can verify that the file
|
---|
1485 | is properly sorted. */
|
---|
1486 |
|
---|
1487 | /* lb1 and lb2 contain one buffer each per file. */
|
---|
1488 | lb1 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
|
---|
1489 | lb2 = (struct linebuffer *) xmalloc (nfiles * sizeof (struct linebuffer));
|
---|
1490 |
|
---|
1491 | /* thisline[i] points to the linebuffer holding the next available
|
---|
1492 | line in file i, or is zero if there are no lines left in that file. */
|
---|
1493 | thisline = (struct linebuffer **)
|
---|
1494 | xmalloc (nfiles * sizeof (struct linebuffer *));
|
---|
1495 | /* prevline[i] points to the linebuffer holding the last used line
|
---|
1496 | from file i. This is just for verifying that file i is properly
|
---|
1497 | sorted. */
|
---|
1498 | prevline = (struct linebuffer **)
|
---|
1499 | xmalloc (nfiles * sizeof (struct linebuffer *));
|
---|
1500 | /* streams[i] holds the input stream for file i. */
|
---|
1501 | streams = (FILE **) xmalloc (nfiles * sizeof (FILE *));
|
---|
1502 | /* file_lossage[i] is nonzero if we already know file i is not
|
---|
1503 | properly sorted. */
|
---|
1504 | file_lossage = (int *) xmalloc (nfiles * sizeof (int));
|
---|
1505 |
|
---|
1506 | /* Allocate and initialize all that storage. */
|
---|
1507 |
|
---|
1508 | for (i = 0; i < nfiles; i++)
|
---|
1509 | {
|
---|
1510 | initbuffer (&lb1[i]);
|
---|
1511 | initbuffer (&lb2[i]);
|
---|
1512 | thisline[i] = &lb1[i];
|
---|
1513 | prevline[i] = &lb2[i];
|
---|
1514 | file_lossage[i] = 0;
|
---|
1515 | streams[i] = fopen (infiles[i], "r");
|
---|
1516 | if (!streams[i])
|
---|
1517 | pfatal_with_name (infiles[i]);
|
---|
1518 |
|
---|
1519 | readline (thisline[i], streams[i]);
|
---|
1520 | }
|
---|
1521 |
|
---|
1522 | /* Keep count of number of files not at eof. */
|
---|
1523 | nleft = nfiles;
|
---|
1524 |
|
---|
1525 | while (nleft)
|
---|
1526 | {
|
---|
1527 | struct linebuffer *best = 0;
|
---|
1528 | struct linebuffer *exch;
|
---|
1529 | int bestfile = -1;
|
---|
1530 | int i;
|
---|
1531 |
|
---|
1532 | /* Look at the next avail line of each file; choose the least one. */
|
---|
1533 |
|
---|
1534 | for (i = 0; i < nfiles; i++)
|
---|
1535 | {
|
---|
1536 | if (thisline[i] &&
|
---|
1537 | (!best ||
|
---|
1538 | 0 < compare_general (best->buffer, thisline[i]->buffer,
|
---|
1539 | (long) bestfile, (long) i, num_keyfields)))
|
---|
1540 | {
|
---|
1541 | best = thisline[i];
|
---|
1542 | bestfile = i;
|
---|
1543 | }
|
---|
1544 | }
|
---|
1545 |
|
---|
1546 | /* Output that line, unless it matches the previous one and we
|
---|
1547 | don't want duplicates. */
|
---|
1548 |
|
---|
1549 | if (!(prev_out &&
|
---|
1550 | !compare_general (prev_out->buffer,
|
---|
1551 | best->buffer, 0L, 1L, num_keyfields - 1)))
|
---|
1552 | indexify (best->buffer, ostream);
|
---|
1553 | prev_out = best;
|
---|
1554 |
|
---|
1555 | /* Now make the line the previous of its file, and fetch a new
|
---|
1556 | line from that file. */
|
---|
1557 |
|
---|
1558 | exch = prevline[bestfile];
|
---|
1559 | prevline[bestfile] = thisline[bestfile];
|
---|
1560 | thisline[bestfile] = exch;
|
---|
1561 |
|
---|
1562 | while (1)
|
---|
1563 | {
|
---|
1564 | /* If the file has no more, mark it empty. */
|
---|
1565 |
|
---|
1566 | if (feof (streams[bestfile]))
|
---|
1567 | {
|
---|
1568 | thisline[bestfile] = 0;
|
---|
1569 | /* Update the number of files still not empty. */
|
---|
1570 | nleft--;
|
---|
1571 | break;
|
---|
1572 | }
|
---|
1573 | readline (thisline[bestfile], streams[bestfile]);
|
---|
1574 | if (thisline[bestfile]->buffer[0] || !feof (streams[bestfile]))
|
---|
1575 | break;
|
---|
1576 | }
|
---|
1577 | }
|
---|
1578 |
|
---|
1579 | finish_index (ostream);
|
---|
1580 |
|
---|
1581 | /* Free all storage and close all input streams. */
|
---|
1582 |
|
---|
1583 | for (i = 0; i < nfiles; i++)
|
---|
1584 | {
|
---|
1585 | fclose (streams[i]);
|
---|
1586 | free (lb1[i].buffer);
|
---|
1587 | free (lb2[i].buffer);
|
---|
1588 | }
|
---|
1589 | free (file_lossage);
|
---|
1590 | free (lb1);
|
---|
1591 | free (lb2);
|
---|
1592 | free (thisline);
|
---|
1593 | free (prevline);
|
---|
1594 | free (streams);
|
---|
1595 |
|
---|
1596 | if (outfile)
|
---|
1597 | fclose (ostream);
|
---|
1598 |
|
---|
1599 | return lossage;
|
---|
1600 | }
|
---|
1601 | |
---|
1602 |
|
---|
1603 | /* Print error message and exit. */
|
---|
1604 |
|
---|
1605 | void
|
---|
1606 | fatal (const char *format, const char *arg)
|
---|
1607 | {
|
---|
1608 | error (format, arg);
|
---|
1609 | xexit (1);
|
---|
1610 | }
|
---|
1611 |
|
---|
1612 | /* Print error message. FORMAT is printf control string, ARG is arg for it. */
|
---|
1613 | void
|
---|
1614 | error (const char *format, const char *arg)
|
---|
1615 | {
|
---|
1616 | printf ("%s: ", program_name);
|
---|
1617 | printf (format, arg);
|
---|
1618 | if (format[strlen (format) -1] != '\n')
|
---|
1619 | printf ("\n");
|
---|
1620 | }
|
---|
1621 |
|
---|
1622 | void
|
---|
1623 | perror_with_name (const char *name)
|
---|
1624 | {
|
---|
1625 | fprintf (stderr, "%s: ", program_name);
|
---|
1626 | perror (name);
|
---|
1627 | }
|
---|
1628 |
|
---|
1629 | void
|
---|
1630 | pfatal_with_name (const char *name)
|
---|
1631 | {
|
---|
1632 | perror_with_name (name);
|
---|
1633 | xexit (1);
|
---|
1634 | }
|
---|
1635 |
|
---|
1636 | |
---|
1637 |
|
---|
1638 | /* Return a newly-allocated string concatenating S1 and S2. */
|
---|
1639 |
|
---|
1640 | char *
|
---|
1641 | concat (char *s1, char *s2)
|
---|
1642 | {
|
---|
1643 | int len1 = strlen (s1), len2 = strlen (s2);
|
---|
1644 | char *result = (char *) xmalloc (len1 + len2 + 1);
|
---|
1645 |
|
---|
1646 | strcpy (result, s1);
|
---|
1647 | strcpy (result + len1, s2);
|
---|
1648 | *(result + len1 + len2) = 0;
|
---|
1649 |
|
---|
1650 | return result;
|
---|
1651 | }
|
---|