source: trunk/libdjvu/BSByteStream.h @ 199

Last change on this file since 199 was 17, checked in by Eugene Romanenko, 16 years ago

update makefiles, remove absolute paths, update djvulibre to version 3.5.17

File size: 11.3 KB
Line 
1//C-  -*- C++ -*-
2//C- -------------------------------------------------------------------
3//C- DjVuLibre-3.5
4//C- Copyright (c) 2002  Leon Bottou and Yann Le Cun.
5//C- Copyright (c) 2001  AT&T
6//C-
7//C- This software is subject to, and may be distributed under, the
8//C- GNU General Public License, Version 2. The license should have
9//C- accompanied the software or you may obtain a copy of the license
10//C- from the Free Software Foundation at http://www.fsf.org .
11//C-
12//C- This program is distributed in the hope that it will be useful,
13//C- but WITHOUT ANY WARRANTY; without even the implied warranty of
14//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15//C- GNU General Public License for more details.
16//C-
17//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library
18//C- distributed by Lizardtech Software.  On July 19th 2002, Lizardtech
19//C- Software authorized us to replace the original DjVu(r) Reference
20//C- Library notice by the following text (see doc/lizard2002.djvu):
21//C-
22//C-  ------------------------------------------------------------------
23//C- | DjVu (r) Reference Library (v. 3.5)
24//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
25//C- | The DjVu Reference Library is protected by U.S. Pat. No.
26//C- | 6,058,214 and patents pending.
27//C- |
28//C- | This software is subject to, and may be distributed under, the
29//C- | GNU General Public License, Version 2. The license should have
30//C- | accompanied the software or you may obtain a copy of the license
31//C- | from the Free Software Foundation at http://www.fsf.org .
32//C- |
33//C- | The computer code originally released by LizardTech under this
34//C- | license and unmodified by other parties is deemed "the LIZARDTECH
35//C- | ORIGINAL CODE."  Subject to any third party intellectual property
36//C- | claims, LizardTech grants recipient a worldwide, royalty-free,
37//C- | non-exclusive license to make, use, sell, or otherwise dispose of
38//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the
39//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU
40//C- | General Public License.   This grant only confers the right to
41//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to
42//C- | the extent such infringement is reasonably necessary to enable
43//C- | recipient to make, have made, practice, sell, or otherwise dispose
44//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to
45//C- | any greater extent that may be necessary to utilize further
46//C- | modifications or combinations.
47//C- |
48//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
49//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
50//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
51//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
52//C- +------------------------------------------------------------------
53//
54// $Id: BSByteStream.h,v 1.8 2003/11/07 22:08:20 leonb Exp $
55// $Name:  $
56
57#ifndef _BSBYTESTREAM_H
58#define _BSBYTESTREAM_H
59#ifdef HAVE_CONFIG_H
60#include "config.h"
61#endif
62#if NEED_GNUG_PRAGMAS
63# pragma interface
64#endif
65
66/** @name BSByteStream.h
67   
68    Files #"BSByteStream.h"# and #"BSByteStream.cpp"# implement a very compact
69    general purpose compressor based on the Burrows-Wheeler transform.  The
70    utility program \Ref{bzz} provides a front-end for this class. Although
71    this compression model is not currently used in DjVu files, it may be used
72    in the future for encoding textual data chunks.
73
74    {\bf Algorithms} --- The Burrows-Wheeler transform (also named Block-Sorting)
75    is performed using a combination of the Karp-Miller-Rosenberg and the
76    Bentley-Sedgewick algorithms. This is comparable to (Sadakane, DCC 98)
77    with a slightly more flexible ranking scheme. Symbols are then ordered
78    according to a running estimate of their occurrence frequencies.  The
79    symbol ranks are then coded using a simple fixed tree and the
80    \Ref{ZPCodec} binary adaptive coder.
81
82    {\bf Performances} --- The basic algorithm is mostly similar to those
83    implemented in well known compressors like #bzip# or #bzip2#
84    (\URL{http://www.muraroa.demon.co.uk}).  The adaptive binary coder however
85    generates small differences. The adaptation noise may cost up to 5\% in
86    file size, but this penalty is usually offset by the benefits of
87    adaptation.  This is good when processing large and highly structured
88    files like spreadsheet files.  Compression and decompression speed is
89    about twice slower than #bzip2# but the sorting algorithms is more
90    robust. Unlike #bzip2# (as of August 1998), this code can compress half a
91    megabyte of "abababab...." in bounded time.
92   
93    Here are some comparative results (in bits per character) obtained on the
94    Canterbury Corpus (\URL{http://corpus.canterbury.ac.nz}) as of August
95    1998. The BSByteStream performance on the single spreadsheet file #Excl#
96    moves #bzz#'s weighted average ahead of much more sophisticated methods,
97    like Suzanne Bunton's #fsmxBest# system
98    \URL{http://corpus.canterbury.ac.nz/methodinfo/fsmx.html}.  This result
99    will not last very long.
100
101    {\footnotesize
102    \begin{tabular}{lccccccccccccc}
103      & text & fax & Csrc & Excl & SPRC & tech
104      & poem & html & lisp & man & play & Weighted & Average \\
105      compress
106      & 3.27 & 0.97 & 3.56 & 2.41 & 4.21 & 3.06
107      & 3.38 & 3.68 & 3.90 & 4.43 & 3.51
108      & 2.55 & 3.31 \\
109      gzip -9
110      & 2.85 & 0.82 & 2.24 & 1.63 & 2.67 & 2.71
111      & 3.23 & 2.59 & 2.65 & 3.31 & 3.12
112      & 2.08 & 2.53 \\ 
113      bzip2 -9
114      & 2.27 & 0.78 & 2.18 & 1.01 & 2.70 & 2.02
115      & 2.42 & 2.48 & 2.79 & 3.33 & 2.53
116      & 1.54 & 2.23 \\
117      ppmd
118      & 2.31 & 0.99 & 2.11 & 1.08 & 2.68 & 2.19
119      & 2.48 & 2.38 & 2.43 & 3.00 & 2.53
120      & 1.65 & 2.20 \\
121      fsmx
122      & {\bf 2.10} & 0.79 & {\bf 1.89} & 1.48 & {\bf 2.52} & {\bf 1.84}
123      & {\bf 2.21} & {\bf 2.24} & {\bf 2.29} & {\bf 2.91} & {\bf 2.35}
124      & 1.63 & {\bf 2.06} \\
125      {\bf bzz}
126      & 2.25 & {\bf 0.76} & 2.13 & {\bf 0.78} & 2.67 & 2.00
127      & 2.40 & 2.52 & 2.60 & 3.19 & 2.52
128      & {\bf 1.44} & 2.16
129    \end{tabular}
130    }
131
132    Note that the DjVu people have several entries in this table.  Program
133    #compress# was written some time ago by Joe Orost
134    (\URL{http://www.research.att.com/info/orost}). The #ppmc# method, (a
135    precursor of #ppmd#) was created by Paul Howard
136    (\URL{http://www.research.att.com/info/pgh}). The #bzz# program is just
137    below your eyes.
138
139    @author
140    L\'eon Bottou <leonb@research.att.com> -- Initial implementation\\
141    Andrei Erofeev <eaf@geocities.com> -- Improved Block Sorting algorithm.
142    @memo
143    Simple Burrows-Wheeler general purpose compressor.
144    @version
145    #$Id: BSByteStream.h,v 1.8 2003/11/07 22:08:20 leonb Exp $# */
146//@{
147
148
149#include "ByteStream.h"
150#include "GException.h"
151#include "ZPCodec.h"
152
153
154#ifdef HAVE_NAMESPACES
155namespace DJVU {
156# ifdef NOT_DEFINED // Just to fool emacs c++ mode
157}
158#endif
159#endif
160
161
162/** Performs bzz compression/decompression.
163   
164    Class #BSByteStream# defines a \Ref{ByteStream} which transparently
165    performs the BZZ compression/decompression. The constructor of class
166    \Ref{BSByteStream} takes another \Ref{ByteStream} as argument.  Any data
167    written to the BSByteStream is compressed and written to this second
168    ByteStream. Any data read from the BSByteStream is internally generated by
169    decompressing data read from the second ByteStream.
170
171    Program \Ref{bzz} demonstrates how to use this class.  All the hard work
172    is achieved by a simple ByteStream to ByteStream copy, as shown below.
173    \begin{verbatim}
174      GP<ByteStream> in=ByteStream::create(infile,"rb");
175      GP<ByteStream> out=ByteStream::create(outfile,"wb");
176      if (encoding) {
177          BSByteStream bsb(out, blocksize);
178          bsb.copy(*in);
179      } else {
180          BSByteStream bsb(in);
181          out->copy(bsb);
182      }
183    \end{verbatim}
184    Due to the block oriented nature of the Burrows-Wheeler transform, there
185    is a very significant latency between the data input and the data output.
186    You can use function #flush# to force data output at the expense of
187    compression efficiency.
188
189    You should never directly access a ByteStream object connected to a valid
190    BSByteStream object. The ByteStream object can be accessed again after the
191    destruction of the BSByteStream object.  Note that the encoder always
192    flushes its internal buffers and writes a few final code bytes when the
193    BSByteStream object is destroyed.  Note also that the decoder often reads
194    a few bytes beyond the last code byte written by the encoder.  This lag
195    means that you must reposition the ByteStream after the destruction of the
196    BSByteStream object and before re-using the ByteStream object (see
197    \Ref{IFFByteStream}.)
198*/
199class BSByteStream : public ByteStream
200{
201public:
202// Limits on block sizes
203  enum { MINBLOCK=10, MAXBLOCK=4096 };
204
205// Sorting tresholds
206  enum { FREQMAX=4, CTXIDS=3 };
207
208  class Decode;
209  class Encode;
210protected:
211  BSByteStream(GP<ByteStream> bs);
212
213public:
214  /** Creates a BSByteStream.
215      The BSByteStream will be used for decompressing data.
216      \begin{description}
217      \item[Decompression]
218      The BSByteStream is created and the decompressor initializes.  Chunks of
219      data will be read from ByteStream #bs# and decompressed into an internal
220      buffer. Function #read# can be used to access the decompressed data.
221      \end{description} */
222  static GP<ByteStream> create(GP<ByteStream> bs);
223
224  /** Constructs a BSByteStream.
225      The BSByteStream will be used for compressing data.
226      \begin{description}
227      \item[Compression]
228      Set #blocksize# to a positive number smaller than 4096 to
229      initialize the compressor.  Data written to the BSByteStream will be
230      accumulated into an internal buffer.  The buffered data will be
231      compressed and written to ByteStream #bs# whenever the buffer sizes
232      reaches the maximum value specified by argument #blocksize# (in
233      kilobytes).  Using a larger block size usually increases the compression
234      ratio at the expense of computation time.  There is no need however to
235      specify a block size larger than the total number of bytes to compress.
236      Setting #blocksize# to #1024# is a good starting point.  A minimal block
237      size of 10 is silently enforced.
238      \end{description} */
239  static GP<ByteStream> create(GP<ByteStream> bs, const int blocksize);
240
241  // ByteStream Interface
242  ~BSByteStream();
243  virtual long tell(void) const;
244  virtual void flush(void) = 0;
245protected:
246  // Data
247  long            offset;
248  int             bptr;
249  unsigned int    blocksize;
250  int             size;
251  ByteStream *bs;
252  GP<ByteStream> gbs;
253  unsigned char  *data;
254  GPBuffer<unsigned char> gdata;
255  // Coder
256  GP<ZPCodec> gzp;
257  BitContext ctx[300];
258private: 
259  // Cancel C++ default stuff
260  BSByteStream(const BSByteStream &);
261  BSByteStream & operator=(const BSByteStream &);
262  BSByteStream(ByteStream *);
263  BSByteStream(ByteStream *, int);
264};
265
266//@}
267
268
269#ifdef HAVE_NAMESPACES
270}
271# ifndef NOT_USING_DJVU_NAMESPACE
272using namespace DJVU;
273# endif
274#endif
275#endif
Note: See TracBrowser for help on using the repository browser.