source: trunk/libdjvu/DjVmDir.h @ 209

Last change on this file since 209 was 206, checked in by Eugene Romanenko, 14 years ago

DJVU plugin: djvulibre updated to version 3.5.19

File size: 19.3 KB
Line 
1//C-  -*- C++ -*-
2//C- -------------------------------------------------------------------
3//C- DjVuLibre-3.5
4//C- Copyright (c) 2002  Leon Bottou and Yann Le Cun.
5//C- Copyright (c) 2001  AT&T
6//C-
7//C- This software is subject to, and may be distributed under, the
8//C- GNU General Public License, either Version 2 of the license,
9//C- or (at your option) any later version. The license should have
10//C- accompanied the software or you may obtain a copy of the license
11//C- from the Free Software Foundation at http://www.fsf.org .
12//C-
13//C- This program is distributed in the hope that it will be useful,
14//C- but WITHOUT ANY WARRANTY; without even the implied warranty of
15//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16//C- GNU General Public License for more details.
17//C-
18//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from
19//C- Lizardtech Software.  Lizardtech Software has authorized us to
20//C- replace the original DjVu(r) Reference Library notice by the following
21//C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu):
22//C-
23//C-  ------------------------------------------------------------------
24//C- | DjVu (r) Reference Library (v. 3.5)
25//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
26//C- | The DjVu Reference Library is protected by U.S. Pat. No.
27//C- | 6,058,214 and patents pending.
28//C- |
29//C- | This software is subject to, and may be distributed under, the
30//C- | GNU General Public License, either Version 2 of the license,
31//C- | or (at your option) any later version. The license should have
32//C- | accompanied the software or you may obtain a copy of the license
33//C- | from the Free Software Foundation at http://www.fsf.org .
34//C- |
35//C- | The computer code originally released by LizardTech under this
36//C- | license and unmodified by other parties is deemed "the LIZARDTECH
37//C- | ORIGINAL CODE."  Subject to any third party intellectual property
38//C- | claims, LizardTech grants recipient a worldwide, royalty-free,
39//C- | non-exclusive license to make, use, sell, or otherwise dispose of
40//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the
41//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU
42//C- | General Public License.   This grant only confers the right to
43//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to
44//C- | the extent such infringement is reasonably necessary to enable
45//C- | recipient to make, have made, practice, sell, or otherwise dispose
46//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to
47//C- | any greater extent that may be necessary to utilize further
48//C- | modifications or combinations.
49//C- |
50//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
51//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
52//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
53//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
54//C- +------------------------------------------------------------------
55//
56// $Id: DjVmDir.h,v 1.12 2007/03/25 20:48:29 leonb Exp $
57// $Name: release_3_5_19 $
58
59#ifndef _DJVMDIR_H
60#define _DJVMDIR_H
61#ifdef HAVE_CONFIG_H
62#include "config.h"
63#endif
64#if NEED_GNUG_PRAGMAS
65# pragma interface
66#endif
67
68
69/** @name DjVmDir.h
70    Files #"DjVmDir.h"# and #"DjVmDir.cpp"# implement class \Ref{DjVmDir} for
71    representing the directory of a DjVu multipage document.
72
73    {\bf Bundled vs. Indirect format} --- There are currently two multipage
74    DjVu formats supported: {\em bundled} and {\em indirect}.  In the first
75    format all component files composing a given document are packaged (or
76    bundled) into one file, in the second one every page and component is
77    stored in a separate file and there is one more file, which contains the
78    list of all others.
79
80    {\bf Multipage DjVu format} --- Multipage DjVu documents follow the EA
81    IFF85 format (cf. \Ref{IFFByteStream.h}.)  A document is composed of a
82    #"FORM:DJVM"# whose first chunk is a #"DIRM"# chunk containing the {\em
83    document directory}.  This directory lists all component files composing
84    the given document, helps to access every component file and identify the
85    pages of the document.
86    \begin{itemize}
87    \item In a {\em bundled} multipage file, the component files
88         are stored immediately after the #"DIRM"# chunk,
89         within the #"FORM:DJVM"# composite chunk.
90    \item In an {\em indirect} multipage file, the component files are
91          stored in different files whose URLs are composed using information
92          stored in the #"DIRM"# chunk.
93    \end{itemize}
94    Most of the component files represent pages of a document.  Some files
95    however represent data shared by several pages.  The pages refer to these
96    supporting files by means of an inclusion chunk (#"INCL"# chunks)
97    identifying the supporting file.
98
99    {\bf Document Directory} --- Every directory record describes a component
100    file.  Each component file is identified by a small string named the
101    identifier (ID).  Each component file also contains a file name and a
102    title.  The format of the #"DIRM"# chunk is described in section
103    \Ref{Format of the DIRM chunk.}.
104
105    Theoretically, IDs are used to uniquely identify each component file in
106    #"INCL"# chunks, names are used to compose the URLs of the component
107    files in an indirect multipage DjVu file, and titles are cosmetic names
108    possibly displayed when viewing a page of a document.  There are however
109    many problems with this scheme, and we {\em strongly suggest}, with the
110    current implementation, to always make the file ID, the file name and the
111    file title identical.
112
113    @memo Implements DjVu multipage document directory
114    @author Andrei Erofeev <eaf@geocities.com>
115    @version
116    #$Id: DjVmDir.h,v 1.12 2007/03/25 20:48:29 leonb Exp $# */
117//@{
118
119
120
121#include "GString.h"
122#include "GThreads.h"
123
124#ifdef HAVE_NAMESPACES
125namespace DJVU {
126# ifdef NOT_DEFINED // Just to fool emacs c++ mode
127}
128#endif
129#endif
130
131class ByteStream;
132
133/** Implements DjVu multipage document directory.  There are currently two
134    multipage DjVu formats supported: {\em bundled} and {\em indirect}.  In
135    the first format all component files composing a given document are
136    packaged (or bundled) into one file, in the second one every page and
137    component is stored in a separate file and there is one more file, which
138    contains the list of all others.
139
140    The multipage document directory lists all component files composing the
141    given document, helps to access every file, identify pages and maintain
142    user-specified shortcuts.  Every directory record describes a file
143    composing the document.  Each file is identified by a small string named
144    the identifier (ID).  Each file may also contain a file name and a title.
145
146    The #DjVmDir# class represents a multipage document directory.  Its main
147    purpose is to encode and decode the document directory when writing or
148    reading the #DIRM# chunk.  Normally you don't have to create this class
149    yourself. It's done automatically when \Ref{DjVmDoc} class initializes
150    itself. It may be useful though to be able to access records in the
151    directory because some classes (like \Ref{DjVuDocument} and \Ref{DjVmDoc})
152    return a pointer to #DjVmDir# in some cases. */
153
154class DjVmDir : public GPEnabled
155{
156protected:
157      /** Class \Ref{DjVmDir::File} represents the directory records
158          managed by class \Ref{DjVmDir}. */
159   DjVmDir(void) { } ;
160public:
161   class File;
162
163   static const int version;
164
165      /** Class \Ref{DjVmDir::File} represents the directory records
166          managed by class \Ref{DjVmDir}. */
167   static GP<DjVmDir> create(void) {return new DjVmDir; } ;
168
169      /** Decodes the directory from the specified stream. */
170   void decode(const GP<ByteStream> &stream);
171      /** Encodes the directory into the specified stream. */
172   void encode(const GP<ByteStream> &stream, const bool do_rename=false) const;
173      /** Encodes the directory into the specified stream,
174          explicitely as bundled or indirect. */
175  void encode(const GP<ByteStream> &stream, 
176              const bool bundled, const bool do_rename) const;
177      /** Tests if directory defines an {\em indirect} document. */
178   bool is_indirect(void) const;
179      /** Tests if the directory defines a {\em bundled} document. */
180   bool is_bundled(void) const;
181      /** Translates page numbers to file records. */
182   GP<File> page_to_file(int page_num) const;
183      /** Translates file names to file records. */
184   GP<File> name_to_file(const GUTF8String & name) const;
185      /** Translates file IDs to file records. */
186   GP<File> id_to_file(const GUTF8String &id) const;
187      /** Translates file shortcuts to file records. */
188   GP<File> title_to_file(const GUTF8String &title) const;
189      /** Access file record by position. */
190   GP<File> pos_to_file(int fileno, int *ppageno=0) const;
191      /** Returns position of the file in the directory. */
192   int get_file_pos(const File * f) const;
193      /** Returns position of the given page in the directory. */
194   int get_page_pos(int page_num) const;
195      /** Check for duplicate names, and resolve them. */
196   GPList<File> resolve_duplicates(const bool save_as_bundled);
197      /** Returns a copy of the list of file records. */
198   GPList<File> get_files_list(void) const;
199      /** Returns the number of file records. */
200   int get_files_num(void) const;
201      /** Returns the number of file records representing pages. */
202   int get_pages_num(void) const;
203      /** Returns back pointer to the file with #SHARED_ANNO# flag.
204        Note that there may be only one file with shared annotations
205        in any multipage DjVu document. */
206   GP<File> get_shared_anno_file(void) const;
207      /** Changes the title of the file with ID #id#. */
208   void set_file_title(const GUTF8String &id, const GUTF8String &title);
209      /** Changes the name of the file with ID #id#. */
210   void set_file_name(const GUTF8String &id, const GUTF8String &name);
211      /** Inserts the specified file record at the specified position.
212        Specifying #pos# equal to #-1# means to append.  The actual position
213        inserted is returned. */
214   int insert_file(const GP<File> & file, int pos=-1);
215      /** Removes a file record with ID #id#. */
216   void delete_file(const GUTF8String &id);
217private:
218   GCriticalSection class_lock;
219   GPList<File> files_list;
220   GPArray<File> page2file;
221   GPMap<GUTF8String, File> name2file;
222   GPMap<GUTF8String, File> id2file;
223   GPMap<GUTF8String, File> title2file;
224private: //dummy stuff
225   static void decode(ByteStream *);
226   static void encode(ByteStream *);
227};
228
229class DjVmDir::File : public GPEnabled
230{
231public:
232  // Out of the record: INCLUDE below must be zero and PAGE must be one.
233  // This is to avoid problems with the File constructor, which now takes
234  // 'int file_type' as the last argument instead of 'bool is_page'
235 
236  /** File type. Possible file types are:
237     \begin{description}
238       \item[PAGE] This is a top level page file. It may include other
239         #INCLUDE#d files, which may in turn be shared between
240         different pages.
241       \item[INCLUDE] This file is included into some other file inside
242         this document.
243       \item[THUMBNAILS] This file contains thumbnails for the document
244         pages.
245       \item[SHARED_ANNO] This file contains annotations shared by
246         all the pages. It's supposed to be included into every page
247         for the annotations to take effect. There may be only one
248         file with shared annotations in a document.
249     \end{description} */
250  enum FILE_TYPE { INCLUDE=0, PAGE=1, THUMBNAILS=2, SHARED_ANNO=3 };
251protected:
252  /** Default constructor. */
253  File(void);
254
255public:
256  static GP<File> create(void) { return new File(); }
257  static GP<File> create(const GUTF8String &load_name,
258     const GUTF8String &save_name, const GUTF8String &title,
259     const FILE_TYPE file_type);
260
261  /** Check for filenames that are not valid for the native encoding,
262      and change them. */
263  const GUTF8String &check_save_name(const bool as_bundled);
264
265  /** File name.  The optional file name must be unique and is the name
266      that will be used when the document is saved to an indirect file.
267      If not assigned, the value of #id# will be used for this purpose.
268      By keeping the name in {\em bundled} document we guarantee, that it
269      can be expanded later into {\em indirect} document and files will
270      still have the same names, if the name is legal on a given filesystem.
271    */
272  const GUTF8String &get_save_name(void) const;
273
274  /** File identifier.  The encoder assigns a unique identifier to each file
275      in a multipage document. This is the name used when loading files.
276      Indirection chunks in other files (#"INCL"# chunks) may refer to another
277      file using its identifier. */
278  const GUTF8String &get_load_name(void) const;
279  void set_load_name(const GUTF8String &id);
280
281  /** File title.  The file title is assigned by the user and may be used as
282      a shortcut for viewing a particular page.  Names like #"chapter1"# or
283      #"appendix"# are appropriate. */
284  const GUTF8String &get_title() const;
285  void set_title(const GUTF8String &id);
286
287  /** Reports an ascii string indicating file type. */
288  GUTF8String get_str_type(void) const;
289
290  /** Offset of the file data in a bundled DJVM file.  This number is
291      relevant in the {\em bundled} case only when everything is packed into
292      one single file. */
293  int offset;
294
295  /** Size of the file data in a bundled DJVM file.  This number is
296      relevant in the {\em bundled} case only when everything is
297      packed into one single file. */
298  int size;
299
300  /** Have we checked the saved file name, to see if it is valid on the
301      local disk? */
302  bool valid_name;
303
304  /** Tests if this file represents a page of the document. */
305  bool is_page(void) const 
306  {
307    return (flags & TYPE_MASK)==PAGE;
308  }
309
310  /** Returns #TRUE# if this file is included into some other files of
311      this document. */
312  bool is_include(void) const
313  {
314    return (flags & TYPE_MASK)==INCLUDE;
315  }
316
317  /** Returns #TRUE# if this file contains thumbnails for the document pages. */
318  bool is_thumbnails(void) const
319  {
320    return (flags & TYPE_MASK)==THUMBNAILS;
321  }
322
323  /** Returns the page number of this file. This function returns
324      #-1# if this file does not represent a page of the document. */
325  bool is_shared_anno(void) const
326  { return (flags & TYPE_MASK)==SHARED_ANNO; }
327
328  int get_page_num(void) const 
329  { return page_num; } 
330protected:
331  GUTF8String name;
332  GUTF8String oldname;
333  GUTF8String id;
334  GUTF8String title; 
335  void set_save_name(const GUTF8String &name);
336private:
337      friend class DjVmDir;
338      enum FLAGS_0 { IS_PAGE_0=1, HAS_NAME_0=2, HAS_TITLE_0=4 };
339      enum FLAGS_1 { HAS_NAME=0x80, HAS_TITLE=0x40, TYPE_MASK=0x3f };
340      unsigned char flags;
341      int page_num;
342};
343
344inline const GUTF8String &
345DjVmDir::File::get_load_name(void) const
346{ return id; }
347
348inline const GUTF8String &
349DjVmDir::File::get_title() const
350{ return *(title.length()?&title:&id); }
351
352inline void
353DjVmDir::File::set_title(const GUTF8String &xtitle) { title=xtitle; }
354
355/** @name Format of the DIRM chunk.
356
357    {\bf Variants} --- There are two versions of the #"DIRM"# chunk format.
358    The version number is identified by the seven low bits of the first byte
359    of the chunk.  Version {\bf 0} is obsolete and should never be used.  This
360    section describes version {\bf 1}.  There are two major multipage DjVu
361    formats supported: {\em bundled} and {\em indirect}.  The #"DIRM"# chunk
362    indicates which format is used in the most significant bit of the first
363    byte of the chunk.  The document is bundled when this bit is set.
364    Otherwise the document is indirect.
365
366    {\bf Unencoded data} --- The #"DIRM"# chunk is composed of some unencoded
367    data followed by \Ref{bzz} encoded data.  The unencoded data starts with
368    the version byte and a 16 bit integer representing the number of component
369    files.  All integers are encoded with the most significant byte first.
370    \begin{verbatim}
371          BYTE:             Flags/Version:  0b<bundled>0000001
372          INT16:            Number of component files.
373    \end{verbatim}
374    When the document is a bundled document (i.e. the flag #bundled# is set),
375    this header is followed by the offsets of each of the component files within
376    the #"FORM:DJVM"#.  These offsets allow for random component file access.
377    \begin{verbatim}
378          INT32:            Offset of first component file.
379          INT32:            Offset of second component file.
380          ...
381          INT32:            Offset of last component file.
382    \end{verbatim}
383
384    {\bf BZZ encoded data} --- The rest of the chunk is entirely compressed
385    with the BZZ general purpose compressor.  We describe now the data fed
386    into (or retrieved from) the BZZ codec (cf. \Ref{BSByteStream}.)  First
387    come the sizes and the flags associated with each component file.
388    \begin{verbatim}
389          INT24:             Size of the first component file.
390          INT24:             Size of the second component file.
391          ...
392          INT24:             Size of the last component file.
393          BYTE:              Flag byte for the first component file.
394          BYTE:              Flag byte for the second component file.
395          ...
396          BYTE:              Flag byte for the last component file.
397    \end{verbatim}
398    The flag bytes have the following format:
399    \begin{verbatim}
400          0b<hasname><hastitle>000000     for a file included by other files.
401          0b<hasname><hastitle>000001     for a file representing a page.
402          0b<hasname><hastitle>000010     for a file containing thumbnails.
403    \end{verbatim}
404    Flag #hasname# is set when the name of the file is different from the file
405    ID.  Flag #hastitle# is set when the title of the file is different from
406    the file ID.  These flags are used to avoid encoding the same string three
407    times.  Then comes a sequence of zero terminated strings.  There are one to
408    three such strings per component file.  The first string contains the ID
409    of the component file.  The second string contains the name of the
410    component file.  It is only present when the flag #hasname# is set. The third
411    one contains the title of the component file. It is only present when the
412    flag #hastitle# is set. The \Ref{bzz} encoding system makes sure that
413    all these strings will be encoded efficiently despite their possible
414    redundancies.
415    \begin{verbatim}
416          ZSTR:     ID of the first component file.
417          ZSTR:     Name of the first component file (only if #hasname# is set.)
418          ZSTR:     Title of the first component file (only if #hastitle# is set.)
419          ...
420          ZSTR:     ID of the last component file.
421          ZSTR:     Name of the last component file (only if #hasname# is set.)
422          ZSTR:     Title of the last component file (only if #hastitle# is set.)
423    \end{verbatim}
424
425    @memo Description of the format of the DIRM chunk.  */
426//@}
427
428
429
430// -------------- IMPLEMENTATION
431
432
433inline bool
434DjVmDir::is_bundled(void) const
435{
436  return ! is_indirect();
437}
438
439inline bool
440DjVmDir::is_indirect(void) const
441{
442  GCriticalSectionLock lock((GCriticalSection *) &class_lock);
443  return ( files_list.size() && files_list[files_list] != 0 &&
444           files_list[files_list]->offset==0 );
445}
446
447
448
449// ----- THE END
450
451#ifdef HAVE_NAMESPACES
452}
453# ifndef NOT_USING_DJVU_NAMESPACE
454using namespace DJVU;
455# endif
456#endif
457#endif
Note: See TracBrowser for help on using the repository browser.