source: trunk/libdjvu/BSByteStream.h @ 209

Last change on this file since 209 was 206, checked in by Eugene Romanenko, 14 years ago

DJVU plugin: djvulibre updated to version 3.5.19

File size: 11.2 KB
Line 
1//C-  -*- C++ -*-
2//C- -------------------------------------------------------------------
3//C- DjVuLibre-3.5
4//C- Copyright (c) 2002  Leon Bottou and Yann Le Cun.
5//C- Copyright (c) 2001  AT&T
6//C-
7//C- This software is subject to, and may be distributed under, the
8//C- GNU General Public License, either Version 2 of the license,
9//C- or (at your option) any later version. The license should have
10//C- accompanied the software or you may obtain a copy of the license
11//C- from the Free Software Foundation at http://www.fsf.org .
12//C-
13//C- This program is distributed in the hope that it will be useful,
14//C- but WITHOUT ANY WARRANTY; without even the implied warranty of
15//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16//C- GNU General Public License for more details.
17//C-
18//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library from
19//C- Lizardtech Software.  Lizardtech Software has authorized us to
20//C- replace the original DjVu(r) Reference Library notice by the following
21//C- text (see doc/lizard2002.djvu and doc/lizardtech2007.djvu):
22//C-
23//C-  ------------------------------------------------------------------
24//C- | DjVu (r) Reference Library (v. 3.5)
25//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
26//C- | The DjVu Reference Library is protected by U.S. Pat. No.
27//C- | 6,058,214 and patents pending.
28//C- |
29//C- | This software is subject to, and may be distributed under, the
30//C- | GNU General Public License, either Version 2 of the license,
31//C- | or (at your option) any later version. The license should have
32//C- | accompanied the software or you may obtain a copy of the license
33//C- | from the Free Software Foundation at http://www.fsf.org .
34//C- |
35//C- | The computer code originally released by LizardTech under this
36//C- | license and unmodified by other parties is deemed "the LIZARDTECH
37//C- | ORIGINAL CODE."  Subject to any third party intellectual property
38//C- | claims, LizardTech grants recipient a worldwide, royalty-free,
39//C- | non-exclusive license to make, use, sell, or otherwise dispose of
40//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the
41//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU
42//C- | General Public License.   This grant only confers the right to
43//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to
44//C- | the extent such infringement is reasonably necessary to enable
45//C- | recipient to make, have made, practice, sell, or otherwise dispose
46//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to
47//C- | any greater extent that may be necessary to utilize further
48//C- | modifications or combinations.
49//C- |
50//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
51//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
52//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
53//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
54//C- +------------------------------------------------------------------
55//
56// $Id: BSByteStream.h,v 1.9 2007/03/25 20:48:29 leonb Exp $
57// $Name: release_3_5_19 $
58
59#ifndef _BSBYTESTREAM_H
60#define _BSBYTESTREAM_H
61#ifdef HAVE_CONFIG_H
62#include "config.h"
63#endif
64#if NEED_GNUG_PRAGMAS
65# pragma interface
66#endif
67
68/** @name BSByteStream.h
69   
70    Files #"BSByteStream.h"# and #"BSByteStream.cpp"# implement a very compact
71    general purpose compressor based on the Burrows-Wheeler transform.  The
72    utility program \Ref{bzz} provides a front-end for this class. Although
73    this compression model is not currently used in DjVu files, it may be used
74    in the future for encoding textual data chunks.
75
76    {\bf Algorithms} --- The Burrows-Wheeler transform (also named Block-Sorting)
77    is performed using a combination of the Karp-Miller-Rosenberg and the
78    Bentley-Sedgewick algorithms. This is comparable to (Sadakane, DCC 98)
79    with a slightly more flexible ranking scheme. Symbols are then ordered
80    according to a running estimate of their occurrence frequencies.  The
81    symbol ranks are then coded using a simple fixed tree and the
82    \Ref{ZPCodec} binary adaptive coder.
83
84    {\bf Performances} --- The basic algorithm is mostly similar to those
85    implemented in well known compressors like #bzip# or #bzip2#
86    (\URL{http://www.muraroa.demon.co.uk}).  The adaptive binary coder however
87    generates small differences. The adaptation noise may cost up to 5\% in
88    file size, but this penalty is usually offset by the benefits of
89    adaptation.  This is good when processing large and highly structured
90    files like spreadsheet files.  Compression and decompression speed is
91    about twice slower than #bzip2# but the sorting algorithms is more
92    robust. Unlike #bzip2# (as of August 1998), this code can compress half a
93    megabyte of "abababab...." in bounded time.
94   
95    Here are some comparative results (in bits per character) obtained on the
96    Canterbury Corpus (\URL{http://corpus.canterbury.ac.nz}) as of August
97    1998. The BSByteStream performance on the single spreadsheet file #Excl#
98    moves #bzz#'s weighted average ahead of much more sophisticated methods,
99    like Suzanne Bunton's #fsmxBest# system
100    \URL{http://corpus.canterbury.ac.nz/methodinfo/fsmx.html}.  This result
101    will not last very long.
102
103    {\footnotesize
104    \begin{tabular}{lccccccccccccc}
105      & text & fax & Csrc & Excl & SPRC & tech
106      & poem & html & lisp & man & play & Weighted & Average \\
107      compress
108      & 3.27 & 0.97 & 3.56 & 2.41 & 4.21 & 3.06
109      & 3.38 & 3.68 & 3.90 & 4.43 & 3.51
110      & 2.55 & 3.31 \\
111      gzip -9
112      & 2.85 & 0.82 & 2.24 & 1.63 & 2.67 & 2.71
113      & 3.23 & 2.59 & 2.65 & 3.31 & 3.12
114      & 2.08 & 2.53 \\ 
115      bzip2 -9
116      & 2.27 & 0.78 & 2.18 & 1.01 & 2.70 & 2.02
117      & 2.42 & 2.48 & 2.79 & 3.33 & 2.53
118      & 1.54 & 2.23 \\
119      ppmd
120      & 2.31 & 0.99 & 2.11 & 1.08 & 2.68 & 2.19
121      & 2.48 & 2.38 & 2.43 & 3.00 & 2.53
122      & 1.65 & 2.20 \\
123      fsmx
124      & {\bf 2.10} & 0.79 & {\bf 1.89} & 1.48 & {\bf 2.52} & {\bf 1.84}
125      & {\bf 2.21} & {\bf 2.24} & {\bf 2.29} & {\bf 2.91} & {\bf 2.35}
126      & 1.63 & {\bf 2.06} \\
127      {\bf bzz}
128      & 2.25 & {\bf 0.76} & 2.13 & {\bf 0.78} & 2.67 & 2.00
129      & 2.40 & 2.52 & 2.60 & 3.19 & 2.52
130      & {\bf 1.44} & 2.16
131    \end{tabular}
132    }
133
134    Note that the DjVu people have several entries in this table.  Program
135    #compress# was written some time ago by Joe Orost
136    (\URL{http://www.research.att.com/info/orost}). The #ppmc# method, (a
137    precursor of #ppmd#) was created by Paul Howard
138    (\URL{http://www.research.att.com/info/pgh}). The #bzz# program is just
139    below your eyes.
140
141    @author
142    L\'eon Bottou <leonb@research.att.com> -- Initial implementation\\
143    Andrei Erofeev <eaf@geocities.com> -- Improved Block Sorting algorithm.
144    @memo
145    Simple Burrows-Wheeler general purpose compressor.
146    @version
147    #$Id: BSByteStream.h,v 1.9 2007/03/25 20:48:29 leonb Exp $# */
148//@{
149
150
151#include "ByteStream.h"
152#include "GException.h"
153#include "ZPCodec.h"
154
155
156#ifdef HAVE_NAMESPACES
157namespace DJVU {
158# ifdef NOT_DEFINED // Just to fool emacs c++ mode
159}
160#endif
161#endif
162
163
164/** Performs bzz compression/decompression.
165   
166    Class #BSByteStream# defines a \Ref{ByteStream} which transparently
167    performs the BZZ compression/decompression. The constructor of class
168    \Ref{BSByteStream} takes another \Ref{ByteStream} as argument.  Any data
169    written to the BSByteStream is compressed and written to this second
170    ByteStream. Any data read from the BSByteStream is internally generated by
171    decompressing data read from the second ByteStream.
172
173    Program \Ref{bzz} demonstrates how to use this class.  All the hard work
174    is achieved by a simple ByteStream to ByteStream copy, as shown below.
175    \begin{verbatim}
176      GP<ByteStream> in=ByteStream::create(infile,"rb");
177      GP<ByteStream> out=ByteStream::create(outfile,"wb");
178      if (encoding) {
179          BSByteStream bsb(out, blocksize);
180          bsb.copy(*in);
181      } else {
182          BSByteStream bsb(in);
183          out->copy(bsb);
184      }
185    \end{verbatim}
186    Due to the block oriented nature of the Burrows-Wheeler transform, there
187    is a very significant latency between the data input and the data output.
188    You can use function #flush# to force data output at the expense of
189    compression efficiency.
190
191    You should never directly access a ByteStream object connected to a valid
192    BSByteStream object. The ByteStream object can be accessed again after the
193    destruction of the BSByteStream object.  Note that the encoder always
194    flushes its internal buffers and writes a few final code bytes when the
195    BSByteStream object is destroyed.  Note also that the decoder often reads
196    a few bytes beyond the last code byte written by the encoder.  This lag
197    means that you must reposition the ByteStream after the destruction of the
198    BSByteStream object and before re-using the ByteStream object (see
199    \Ref{IFFByteStream}.)
200*/
201class BSByteStream : public ByteStream
202{
203public:
204// Limits on block sizes
205  enum { MINBLOCK=10, MAXBLOCK=4096 };
206
207// Sorting tresholds
208  enum { FREQMAX=4, CTXIDS=3 };
209
210  class Decode;
211  class Encode;
212protected:
213  BSByteStream(GP<ByteStream> bs);
214
215public:
216  /** Creates a BSByteStream.
217      The BSByteStream will be used for decompressing data.
218      \begin{description}
219      \item[Decompression]
220      The BSByteStream is created and the decompressor initializes.  Chunks of
221      data will be read from ByteStream #bs# and decompressed into an internal
222      buffer. Function #read# can be used to access the decompressed data.
223      \end{description} */
224  static GP<ByteStream> create(GP<ByteStream> bs);
225
226  /** Constructs a BSByteStream.
227      The BSByteStream will be used for compressing data.
228      \begin{description}
229      \item[Compression]
230      Set #blocksize# to a positive number smaller than 4096 to
231      initialize the compressor.  Data written to the BSByteStream will be
232      accumulated into an internal buffer.  The buffered data will be
233      compressed and written to ByteStream #bs# whenever the buffer sizes
234      reaches the maximum value specified by argument #blocksize# (in
235      kilobytes).  Using a larger block size usually increases the compression
236      ratio at the expense of computation time.  There is no need however to
237      specify a block size larger than the total number of bytes to compress.
238      Setting #blocksize# to #1024# is a good starting point.  A minimal block
239      size of 10 is silently enforced.
240      \end{description} */
241  static GP<ByteStream> create(GP<ByteStream> bs, const int blocksize);
242
243  // ByteStream Interface
244  ~BSByteStream();
245  virtual long tell(void) const;
246  virtual void flush(void) = 0;
247protected:
248  // Data
249  long            offset;
250  int             bptr;
251  unsigned int    blocksize;
252  int             size;
253  ByteStream *bs;
254  GP<ByteStream> gbs;
255  unsigned char  *data;
256  GPBuffer<unsigned char> gdata;
257  // Coder
258  GP<ZPCodec> gzp;
259  BitContext ctx[300];
260private: 
261  // Cancel C++ default stuff
262  BSByteStream(const BSByteStream &);
263  BSByteStream & operator=(const BSByteStream &);
264  BSByteStream(ByteStream *);
265  BSByteStream(ByteStream *, int);
266};
267
268//@}
269
270
271#ifdef HAVE_NAMESPACES
272}
273# ifndef NOT_USING_DJVU_NAMESPACE
274using namespace DJVU;
275# endif
276#endif
277#endif
Note: See TracBrowser for help on using the repository browser.