source: trunk/libdjvu/DataPool.h @ 136

Last change on this file since 136 was 17, checked in by Eugene Romanenko, 16 years ago

update makefiles, remove absolute paths, update djvulibre to version 3.5.17

File size: 26.8 KB
1//C-  -*- C++ -*-
2//C- -------------------------------------------------------------------
3//C- DjVuLibre-3.5
4//C- Copyright (c) 2002  Leon Bottou and Yann Le Cun.
5//C- Copyright (c) 2001  AT&T
7//C- This software is subject to, and may be distributed under, the
8//C- GNU General Public License, Version 2. The license should have
9//C- accompanied the software or you may obtain a copy of the license
10//C- from the Free Software Foundation at http://www.fsf.org .
12//C- This program is distributed in the hope that it will be useful,
13//C- but WITHOUT ANY WARRANTY; without even the implied warranty of
14//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15//C- GNU General Public License for more details.
17//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library
18//C- distributed by Lizardtech Software.  On July 19th 2002, Lizardtech
19//C- Software authorized us to replace the original DjVu(r) Reference
20//C- Library notice by the following text (see doc/lizard2002.djvu):
22//C-  ------------------------------------------------------------------
23//C- | DjVu (r) Reference Library (v. 3.5)
24//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
25//C- | The DjVu Reference Library is protected by U.S. Pat. No.
26//C- | 6,058,214 and patents pending.
27//C- |
28//C- | This software is subject to, and may be distributed under, the
29//C- | GNU General Public License, Version 2. The license should have
30//C- | accompanied the software or you may obtain a copy of the license
31//C- | from the Free Software Foundation at http://www.fsf.org .
32//C- |
33//C- | The computer code originally released by LizardTech under this
34//C- | license and unmodified by other parties is deemed "the LIZARDTECH
35//C- | ORIGINAL CODE."  Subject to any third party intellectual property
36//C- | claims, LizardTech grants recipient a worldwide, royalty-free,
37//C- | non-exclusive license to make, use, sell, or otherwise dispose of
38//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the
39//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU
40//C- | General Public License.   This grant only confers the right to
41//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to
42//C- | the extent such infringement is reasonably necessary to enable
43//C- | recipient to make, have made, practice, sell, or otherwise dispose
44//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to
45//C- | any greater extent that may be necessary to utilize further
46//C- | modifications or combinations.
47//C- |
52//C- +------------------------------------------------------------------
54// $Id: DataPool.h,v 1.10 2003/11/07 22:08:20 leonb Exp $
55// $Name:  $
57#ifndef _DATAPOOL_H
58#define _DATAPOOL_H
59#ifdef HAVE_CONFIG_H
60#include "config.h"
63# pragma interface
67#include "GThreads.h"
68#include "GString.h"
69#include "GURL.h"
72namespace DJVU {
73# ifdef NOT_DEFINED // Just to fool emacs c++ mode
78class ByteStream;
80/** @name DataPool.h
81    Files #"DataPool.h"# and #"DataPool.cpp"# implement classes \Ref{DataPool}
82    and \Ref{DataRange} used by DjVu decoder to access data.
84    The main goal of class \Ref{DataPool} is to provide concurrent access
85    to the same data from many threads with a possibility to add data
86    from yet another thread. It is especially important in the case of the
87    Netscape plugin when data is not immediately available, but decoding
88    should be started as soon as possible. In this situation it is vital
89    to provide transparent access to the data from many threads possibly
90    blocking readers that try to access information that has not been
91    received yet.
93    When the data is local though, it can be accessed directly using
94    standard IO mechanism. To provide a uniform interface for decoding
95    routines, \Ref{DataPool} supports file mode as well.
97    @memo Thread safe data storage
98    @author Andrei Erofeev <>
99    @version #$Id: DataPool.h,v 1.10 2003/11/07 22:08:20 leonb Exp $#
104/** Thread safe data storage.
105    The purpose of #DataPool# is to provide a uniform interface for
106    accessing data from decoding routines running in a multi-threaded
107    environment. Depending on the mode of operation it may contain the
108    actual data, may be connected to another #DataPool# or may be mapped
109    to a file. Regardless of the mode, the class returns data in a
110    thread-safe way, blocking reading threads if there is no data of
111    interest available. This blocking is especially useful in the
112    networking environment (plugin) when there is a running decoding thread,
113    which wants to start decoding as soon as there is just one byte available
114    blocking if necessary.
116    Access to data in a #DataPool# may be direct (Using \Ref{get_data}()
117    function) or sequential (See \Ref{get_stream}() function).
119    If the #DataPool# is not connected to anything, that is it contains
120    some real data, this data can be added to it by means of two
121    \Ref{add_data}() functions. One of them adds data sequentially maintaining
122    the offset of the last block of data added by it. The other can store
123    data anywhere. Thus it's important to realize, that there may be "white
124    spots" in the data storage.
126    There is also a way to test if data is available for some given data
127    range (See \Ref{has_data}()). In addition to this mechanism, there are
128    so-called {\em trigger callbacks}, which are called, when there is
129    all data available for a given data range.
131    Let us consider all modes of operation in details:
133    \begin{enumerate}
134       \item {\bf Not connected #DataPool#}. In this mode the #DataPool#
135             contains some real data. As mentioned above, it may be added 
136             by means of two functions \Ref{add_data}() operating independent
137             of each other and allowing to add data sequentially and
138             directly to any place of data storage. It's important to call
139             function \Ref{set_eof}() after all data has been added.
141             Functions like \Ref{get_data}() or \Ref{get_stream}() can
142             be used to obtain direct or sequential access to the data. As
143             long as \Ref{is_eof}() is #FALSE#, #DataPool# will block every
144             reader, which is trying to read unavailable data until it
145             really becomes available. But as soon as \Ref{is_eof}() is
146             #TRUE#, any attempt to read non-existing data will read #0# bytes.
148             Taking into account the fact, that #DataPool# was designed to
149             store DjVu files, which are in IFF formats, it becomes possible
150             to predict the size of the #DataPool# as soon as the first
151             #32# bytes have been added. This is invaluable for estimating
152             download progress. See function \Ref{get_length}() for details.
153             If this estimate fails (which means, that stored data is not
154             in IFF format), \Ref{get_length}() returns #-1#.
156             Triggers may be added and removed by means of \Ref{add_trigger}()
157             and \Ref{del_trigger}() functions. \Ref{add_trigger}() takes
158             a data range. As soon as all data in that data range is
159             available, the trigger callback will be called.
161             All trigger callbacks will be called when #EOF# condition
162             has been set.
164       \item {\bf #DataPool# connected to another #DataPool#}. In this
165             {\em slave} mode you can map a given #DataPool# to any offsets
166             range inside another #DataPool#. You can connect the slave
167             #DataPool# even if there is no data in the master #DataPool#.
168             Any \Ref{get_data}() request will be forwarded to the master
169             #DataPool#, and it will be responsible for blocking readers
170             trying to access unavailable data.
172             The usage of \Ref{add_data}() functions is prohibited for
173             connected #DataPool#s.
175             The offsets range used to map a slave #DataPool# can be fully
176             specified (both start offset and length are positive numbers)
177             or partially specified (the length is negative). In this mode
178             the slave #DataPool# is assumed to extend up to the end
179             of the master #DataPool#.
181             Triggers may be used with slave #DataPool#s as well as with
182             the master ones.
184             Calling \Ref{stop}() function of a slave will stop only the slave
185             (and any other slave connected to it), but not the master.
187             \Ref{set_eof}() function is meaningless for slaves. They obtain
188             the #ByteStream::EndOfFile# status from their master.
190             Depending on the offsets range passed to the constructor,
191             \Ref{get_length}() returns different values. If the length
192             passed to the constructor was positive, then it is returned
193             by \Ref{get_length}() all the time. Otherwise the value returned
194             is either #-1# if master's length is still unknown (it didn't
195             manage to parse IFF data yet) or it is calculated as
196             #masters_length-slave_start#.
198       \item {\bf #DataPool# connected to a file}. This mode is quite similar
199             to the case, when the #DataPool# is connected to another
200             #DataPool#. Similarly, the #DataPool# stores no data inside.
201             It just forwards all \Ref{get_data}() requests to the underlying
202             source (a file in this case). Thus these requests will never
203             block the reader. But they may return #0# if there is no data
204             available at the requested offset.
206             The usage of \Ref{add_data}() functions is meaningless and
207             is prohibited.
209             \Ref{is_eof}() function always returns #TRUE#. Thus \Ref{set_eof}()
210             is meaningless and does nothing.
212             \Ref{get_length}() function always returns the file size.
214             Calling \Ref{stop}() function will stop this #DataPool# and
215             any other slave connected to it.
217             Trigger callbacks passed through \Ref{add_trigger}() function
218             are called immediately.
220             This mode is useful to read and decode DjVu files without reading
221             and storing them in full in memory.
222    \end{enumerate}
225class DataPool : public GPEnabled
227public: // Classes used internally by DataPool
228        // These are declared public to support buggy C++ compilers.
229   class Incrementor;
230   class Reader;
231   class Trigger;
232   class OpenFiles;
233   class OpenFiles_File;
234   class BlockList;
235   class Counter;
237   DataPool(void);
240      /** @name Initialization */
241      //@{
242      /** Default creator. Will prepare #DataPool# for accepting data
243          added through functions \Ref{add_data}(). Use \Ref{connect}()
244          functions if you want to map this #DataPool# to another or
245          to a file. */
246   static GP<DataPool> create(void);
248      /** Creates and initializes the #DataPool# with data from stream #str#.
249          The constructor will read the stream's contents and add them
250          to the pool using the \Ref{add_data}() function. Afterwards it
251          will call \Ref{set_eof}() function, and no other data will be
252          allowed to be added to the pool. */
253   static GP<DataPool> create(const GP<ByteStream> & str);
255      /** Initializes the #DataPool# in slave mode and connects it
256          to the specified offsets range of the specified master #DataPool#.
257          It is equivalent to calling default constructor and function
258          \Ref{connect}().
260          @param master_pool Master #DataPool# providing data for this slave
261          @param start Beginning of the offsets range which the slave is
262                 mapped into
263          @param length Length of the offsets range. If negative, the range
264                 is assumed to extend up to the end of the master #DataPool#.
265      */
266   static GP<DataPool> create(const GP<DataPool> & master_pool, int start=0, int length=-1);
268      /** Initializes the #DataPool# in slave mode and connects it
269          to the specified offsets range of the specified file.
270          It is equivalent to calling default constructor and function
271          \Ref{connect}().
272          @param url Name of the file to connect to.
273          @param start Beginning of the offsets range which the #DataPool# is
274                 mapped into
275          @param length Length of the offsets range. If negative, the range
276                 is assumed to extend up to the end of the file.
277      */
278   static GP<DataPool> create(const GURL &url, int start=0, int length=-1);
280   virtual ~DataPool();
282      /** Switches the #DataPool# to slave mode and connects it to the
283          specified offsets range of the master #DataPool#.
284          @param master_pool Master #DataPool# providing data for this slave
285          @param start Beginning of the offsets range which the slave is
286                 mapped into
287          @param length Length of the offsets range. If negative, the range
288                 is assumed to extend up to the end of the master #DataPool#.
289      */
290   void         connect(const GP<DataPool> & master_pool, int start=0, int length=-1);
291      /** Connects the #DataPool# to the specified offsets range of
292          the named #url#.
293          @param url Name of the file to connect to.
294          @param start Beginning of the offsets range which the #DataPool# is
295                 mapped into
296          @param length Length of the offsets range. If negative, the range
297                 is assumed to extend up to the end of the file.
298      */
299   void         connect(const GURL &url, int start=0, int length=-1);
300      //@}
302      /** Tells the #DataPool# to stop serving readers.
304          If #only_blocked# flag is #TRUE# then only those requests will
305          be processed, which would not block. Any attempt to get non-existing
306          data would result in a #STOP# exception (instead of blocking until
307          data is available).
309          If #only_blocked# flag is #FALSE# then any further attempt to read
310          from this #DataPool# (as well as from any #DataPool# connected
311          to this one) will result in a #STOP# exception. */
312   void         stop(bool only_blocked=false);
314      /** @name Adding data.
315          Please note, that these functions are for not connected #DataPool#s
316          only. You can not add data to a #DataPool#, which is connected
317          to another #DataPool# or to a file.
318        */
319      //@{
320      /** Appends the new block of data to the #DataPool#. There are two
321          \Ref{add_data}() functions available. One is for adding data
322          sequentially. It keeps track of the last byte position, which has
323          been stored {\bf by it} and always appends the next block after
324          this position. The other \Ref{add_data}() can store data anywhere.
326          The function will unblock readers waiting for data if this data
327          arrives with this block. It may also trigger some {\em trigger
328          callbacks}, which may have been added by means of \Ref{add_trigger}()
329          function.
331          {\bf Note:} After all the data has been added, it's necessary
332          to call \Ref{set_eof}() to tell the #DataPool# that nothing else
333          is expected.
335          {\bf Note:} This function may not be called if the #DataPool#
336          has been connected to something.
338          @param buffer data to append
339          @param size length of the {\em buffer}
340      */
341   void         add_data(const void * buffer, int size);
343      /** Stores the specified block of data at the specified offset.
344          Like the function above this one can also unblock readers
345          waiting for data and engage trigger callbacks. The difference
346          is that {\bf this} function can store data anywhere.
348          {\bf Note:} After all the data has been added, it's necessary
349          to call \Ref{set_eof}() to tell the #DataPool# that nothing else
350          is expected.
352          {\bf Note:} This function may not be called if the #DataPool#
353          has been connected to something.
355          @param buffer data to store
356          @param offset where to store the data
357          @param size length of the {\em buffer} */
358   void         add_data(const void * buffer, int offset, int size);
360      /** Tells the #DataPool# that all data has been added and nothing else
361          is anticipated. When #EOF# is true, any reader attempting to read
362          non existing data will not be blocked. It will either read #ZERO#
363          bytes or will get a #ByteStream::EndOfFile# exception (see \Ref{get_data}()).
364          Calling this function will also activate all registered trigger
365          callbacks.
367          {\bf Note:} This function is meaningless and does nothing
368          when the #DataPool# is connected to another #DataPool# or to
369          a file. */
370   void         set_eof(void);
371      //@}
373      /** @name Accessing data.
374          These functions provide direct and sequential access to the
375          data of the #DataPool#. If the #DataPool# is not connected
376          (contains some real data) then it handles the requests itself.
377          Otherwise they are forwarded to the master #DataPool# or the file.
378        */
379      //@{
380      /** Attempts to return a block of data at the given #offset#
381          of the given #size#.
383          \begin{enumerate}
384             \item If the #DataPool# is connected to another #DataPool# or
385                   to a file, the request will just be forwarded to them.
386             \item If the #DataPool# is not connected to anything and
387                   some of the data requested is in the internal buffer,
388                   the function copies available data to #buffer# and returns
389                   immediately.
391                   If there is no data available, and \Ref{is_eof}() returns
392                   #FALSE#, the reader (and the thread) will be {\bf blocked}
393                   until the data actually arrives. Please note, that since
394                   the reader is blocked, it should run in a separate thread
395                   so that other threads have a chance to call \Ref{add_data}().
396                   If there is no data available, but \Ref{is_eof}() is #TRUE#
397                   the behavior is different and depends on the #DataPool#'s
398                   estimate of the file size:
399                   \begin{itemize}
400                      \item If #DataPool# learns from the IFF structure of the
401                            data, that its size should be greater than it
402                            really is, then any attempt to read non-existing
403                            data in the range of {\em valid} offsets will
404                            result in a #ByteStream::EndOfFile# exception. This is done to
405                            indicate, that there was an error in adding data,
406                            and the data requested is {\bf supposed} to be
407                            there, but has actually not been added.
408                      \item If #DataPool#'s expectations about the data size
409                            coincide with the reality then any attempt to
410                            read data beyond the legal range of offsets will
411                            result in #ZERO# bytes returned.
412                   \end{itemize}.
413          \end{enumerate}.
415          @param buffer Buffer to be filled with data
416          @param offset Offset in the #DataPool# to read data at
417          @param size Size of the {\em buffer}
418          @return The number of bytes actually read
419          @exception STOP The stream has been stopped
420          @exception EOF The requested data is not there and will not be added,
421                     although it should have been.
422      */
423   int          get_data(void * buffer, int offset, int size);
425      /** Returns a \Ref{ByteStream} to access contents of the #DataPool#
426          sequentially. By reading from the returned stream you basically
427          call \Ref{get_data}() function. Thus, everything said for it
428          remains true for the stream too. */
429   GP<ByteStream>       get_stream(void);
430      //@}
432      /** @name State querying functions. */
433      //@{
434      /** Returns #TRUE# if this #DataPool# is connected to another #DataPool#
435          or to a file. */
436   bool         is_connected(void) const;
438      /** Returns #TRUE# if all data is available for offsets from
439          #start# till #start+length-1#. If #length# is negative, the
440          range is assumed to extend up to the end of the #DataPool#.
441          This function works both for connected and not connected #DataPool#s.
442          Once it returned #TRUE# for some offsets range, you can be
443          sure that the subsequent \Ref{get_data}() request will not block.
444      */
445   bool         has_data(int start, int length);
447      /** Returns #TRUE# if no more data is planned to be added.
449         {\bf Note:} This function always returns #TRUE# when the #DataPool#
450         has been initialized with a file name. */
451   bool         is_eof(void) const {return eof_flag;}
453      /** Returns the {\em length} of data in the #DataPool#. The value
454          returned depends on the mode of operation:
455          \begin{itemize}
456             \item If the #DataPool# is not connected to anything then
457                   the length returned is either calculated by interpreting
458                   the IFF structure of stored data (if successful) or
459                   by calculating the real size of data after \Ref{set_eof}()
460                   has been called. Otherwise it is #-1#.
461             \item If the #DataPool# is connected to a file, the length
462                   is calculated basing on the length passed to the
463                   \Ref{connect}() function and the file size.
464             \item If the #DataPool# is connected to a master #DataPool#,
465                   the length is calculated basing on the value returned
466                   by the master's #get_length()# function and the length
467                   passed to the \Ref{connect}() function.
468          \end{itemize}. */
469   int          get_length(void) const;
470      /** Returns the number of bytes of data available in this #DataPool#.
471          Contrary to the \Ref{get_length}() function, this one doesn't try
472          to interpret the IFF structure and predict the file length.
473          It just returns the number of bytes of data really available inside
474          the #DataPool#, if it contains data, or inside its range, if it's
475          connected to another #DataPool# or a file. */
476   int          get_size(void) const {return get_size(0, -1);}
477      //@}
479      /** @name Trigger callbacks.
480          {\em Trigger callbacks} are special callbacks called when
481          all data for the given range of offsets has been made available.
482          Since reading unavailable data may result in a thread block,
483          which may be bad, the usage of {\em trigger callbacks} appears
484          to be a convenient way to signal availability of data.
486          You can add a trigger callback in two ways:
487          \begin{enumerate}
488             \item By specifying a range. This is the most general case
489             \item By providing just one {\em threshold}. In this case
490                   the range is assumed to start from offset #ZERO# and
491                   last for {\em threshold}+1 bytes.
492          \end{enumerate}
493        */
494      //@{
495      /** Associates the specified {\em trigger callback} with the
496          given data range.
498          {\bf Note:} The callback may be called immediately if all
499          data for the given range is already available or #EOF# is #TRUE#.
501          @param start The beginning of the range for which all data
502                 should be available
503          @param length If the {\em length} is not negative then the callback
504                 will be called when there is data available for every
505                 offset from {\em start} to {\em start+length-1}.
506                 If {\em length} is negative, the callback is called after
507                 #EOF# condition has been set.
508          @param callback Function to call
509          @param cl_data Argument to pass to the callback when it's called. */
510   void         add_trigger(int start, int length,
511//                          void (* callback)(GP<GPEnabled> &), GP<GPEnabled> cl_data);
512                            void (* callback)(void *), void * cl_data);
514      /** Associates the specified {\em trigger callback} with the
515          specified threshold.
517          This function is a simplified version of the function above.
518          The callback will be called when there is data available for
519          every offset from #0# to #thresh#, if #thresh# is positive, or
520          when #EOF# condition has been set otherwise. */
521//   void               add_trigger(int thresh, void (* callback)(GP<GPEnabled> &), GP<GPEnabled> cl_data);
522   void         add_trigger(int thresh, void (* callback)(void *), void * cl_data);
524      /** Use this function to unregister callbacks, which are no longer
525          needed. {\bf Note!} It's important to do it when the client
526          is about to be destroyed. */
527   void         del_trigger(void (* callback)(void *), void *  cl_data);
528//   void               del_trigger(void (* callback)(GP<GPEnabled> &), GP<GPEnabled>  cl_data);
529      //@}
531      /** Loads data from the file into memory. This function is only useful
532          for #DataPool#s getting data from a file. It descends the #DataPool#s
533          hierarchy until it either reaches a file-connected #DataPool#
534          or #DataPool# containing the real data. In the latter case it
535          does nothing, in the first case it makes the #DataPool# read all
536          data from the file into memory and stop using the file.
538          This may be useful when you want to overwrite the file and leave
539          existing #DataPool#s with valid data. */
540   void         load_file(void);
541      /** This function will make every #DataPool# in the program, which
542          is connected to a file, to load the file contents to the main
543          memory and close the file. This feature is important when you
544          want to do something with the file like remove or overwrite it
545          not affecting the rest of the program. */
546   static void  load_file(const GURL &url);
548      /** This function will remove OpenFiles filelist. */
549   static void  close_all(void);
551      // Internal. Used by 'OpenFiles'
552   void         clear_stream(const bool release = true);
554      /** Useful in comparing data pools.  Returns true if derived from
555          same URL or bytestream. */
556   bool simple_compare(DataPool &pool) const;
558   bool         eof_flag;
559   bool         stop_flag;
560   bool         stop_blocked_flag;
562   Counter      *active_readers;
564      // Source or storage of data
565   GP<DataPool>         pool;
566   GURL         furl;
567   GP<OpenFiles_File>   fstream;
568   GCriticalSection     class_stream_lock;
569   GP<ByteStream>       data;
570   GCriticalSection     data_lock;
571   BlockList            *block_list;
572   int                  add_at;
573   int                  start, length;
575      // List of readers waiting for data
576   GPList<Reader>       readers_list;
577   GCriticalSection     readers_lock;
579      // Triggers
580   GPList<Trigger>      triggers_list;          // List of passed or our triggers
581   GCriticalSection     triggers_lock;          // Lock for the list above
582   GCriticalSection     trigger_lock;           // Lock for static_trigger_cb()
584   void         init(void);
585   void         wait_for_data(const GP<Reader> & reader);
586   void         wake_up_all_readers(void);
587   void         check_triggers(void);
588   int          get_data(void * buffer, int offset, int size, int level);
589   int          get_size(int start, int length) const;
590   void         restart_readers(void);
592//   static void        static_trigger_cb(GP<GPEnabled> &);
593   static void  static_trigger_cb(void *);
594   void         trigger_cb(void);
595   void         analyze_iff(void);
596   void         added_data(const int offset, const int size);
598  static const char *Stop;
599  friend class FCPools;
602inline bool 
603DataPool::simple_compare(DataPool &pool) const
605  // return true if these pools are identical.  False means they may or may
606  // not be identical.
607  return (this == &pool)
608    ||(furl.is_valid()&&!furl.is_empty()&&pool.furl.is_valid()&&(furl == pool.furl))
609    ||(data && (data ==;
612inline bool
613DataPool::is_connected(void) const
615   return furl.is_local_file_url() || pool!=0;
624using namespace DJVU;
625# endif
Note: See TracBrowser for help on using the repository browser.