source: trunk/libdjvu/DjVuText.h @ 199

Last change on this file since 199 was 17, checked in by Eugene Romanenko, 16 years ago

update makefiles, remove absolute paths, update djvulibre to version 3.5.17

File size: 10.9 KB
Line 
1//C-  -*- C++ -*-
2//C- -------------------------------------------------------------------
3//C- DjVuLibre-3.5
4//C- Copyright (c) 2002  Leon Bottou and Yann Le Cun.
5//C- Copyright (c) 2001  AT&T
6//C-
7//C- This software is subject to, and may be distributed under, the
8//C- GNU General Public License, Version 2. The license should have
9//C- accompanied the software or you may obtain a copy of the license
10//C- from the Free Software Foundation at http://www.fsf.org .
11//C-
12//C- This program is distributed in the hope that it will be useful,
13//C- but WITHOUT ANY WARRANTY; without even the implied warranty of
14//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15//C- GNU General Public License for more details.
16//C-
17//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library
18//C- distributed by Lizardtech Software.  On July 19th 2002, Lizardtech
19//C- Software authorized us to replace the original DjVu(r) Reference
20//C- Library notice by the following text (see doc/lizard2002.djvu):
21//C-
22//C-  ------------------------------------------------------------------
23//C- | DjVu (r) Reference Library (v. 3.5)
24//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved.
25//C- | The DjVu Reference Library is protected by U.S. Pat. No.
26//C- | 6,058,214 and patents pending.
27//C- |
28//C- | This software is subject to, and may be distributed under, the
29//C- | GNU General Public License, Version 2. The license should have
30//C- | accompanied the software or you may obtain a copy of the license
31//C- | from the Free Software Foundation at http://www.fsf.org .
32//C- |
33//C- | The computer code originally released by LizardTech under this
34//C- | license and unmodified by other parties is deemed "the LIZARDTECH
35//C- | ORIGINAL CODE."  Subject to any third party intellectual property
36//C- | claims, LizardTech grants recipient a worldwide, royalty-free,
37//C- | non-exclusive license to make, use, sell, or otherwise dispose of
38//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the
39//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU
40//C- | General Public License.   This grant only confers the right to
41//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to
42//C- | the extent such infringement is reasonably necessary to enable
43//C- | recipient to make, have made, practice, sell, or otherwise dispose
44//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to
45//C- | any greater extent that may be necessary to utilize further
46//C- | modifications or combinations.
47//C- |
48//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY
49//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
50//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF
51//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
52//C- +------------------------------------------------------------------
53//
54// $Id: DjVuText.h,v 1.10 2003/11/07 22:08:21 leonb Exp $
55// $Name:  $
56
57#ifndef _DJVUTEXT_H
58#define _DJVUTEXT_H
59#ifdef HAVE_CONFIG_H
60#include "config.h"
61#endif
62#if NEED_GNUG_PRAGMAS
63# pragma interface
64#endif
65
66
67
68/** @name DjVuText.h
69
70    Files #"DjVuText.h"# and #"DjVuText.cpp"# implement the mechanism for
71    text in DjVuImages.
72
73    This file implements the hidden text understood by the DjVu plugins
74    and encoders.
75
76
77    The text is stored in the contents of #TXT*# chunks.
78
79    Contents of the #FORM:TEXT# should be passed to \Ref{DjVuText::decode}()
80    for parsing, which initializes \Ref{DjVuText::txt}
81    and fills it with decoded data.
82    @memo Implements support for DjVuImage hidden text.
83    @author Andrei Erofeev <eaf@geocities.com>
84    @version
85    #$Id: DjVuText.h,v 1.10 2003/11/07 22:08:21 leonb Exp $# */
86//@{
87
88
89#include "GMapAreas.h"
90
91#ifdef HAVE_NAMESPACES
92namespace DJVU {
93# ifdef NOT_DEFINED // Just to fool emacs c++ mode
94}
95#endif
96#endif
97
98
99class ByteStream;
100
101// -------- DJVUTXT --------
102
103/** Description of the text contained in a DjVu page.  This class contains the
104    textual data for the page.  It describes the text as a hierarchy of zones
105    corresponding to page, column, region, paragraph, lines, words, etc...
106    The piece of text associated with each zone is represented by an offset
107    and a length describing a segment of a global UTF8 encoded string.  */
108
109class DjVuTXT : public GPEnabled
110{
111protected:
     /// Protected constructor. Instances are obtained through #create()#.
112  DjVuTXT(void) {}
113public:
114  /// Default creator. Returns a newly allocated DjVuTXT as a #GP<DjVuTXT>#.
115  static GP<DjVuTXT> create(void) {return new DjVuTXT();}
116
117  /** These constants are used to tell what a zone describes.
118      This can be useful for a copy/paste application.
119      The deeper we go into the hierarchy, the higher the constant. */
120  enum ZoneType { PAGE=1, COLUMN=2, REGION=3, PARAGRAPH=4, 
121                  LINE=5, WORD=6, CHARACTER=7 };
122  /** Data structure representing document textual components.
123      The text structure is represented by a hierarchy of rectangular zones. */
124  class Zone
125  {
126  public:
127    Zone();
128    /** Type of the zone. */
129    enum ZoneType ztype;
130    /** Rectangle spanned by the zone. */
131    GRect rect;
132    /** Position of the zone text in string #textUTF8#. */
133    int text_start;
134    /** Length of the zone text in string #textUTF8#. */
135    int text_length;
136    /** List of child zones. */
137    GList<Zone> children;
138    /** Appends another subzone inside this zone.  The new zone is initialized
139        with an empty rectangle, empty text, and has the same type as this
140        zone. */
141    Zone *append_child();
142    /** Find the text range indicated by the given box.  The range is
           returned through #string_start# and #string_end#. */
143    void get_text_with_rect(const GRect &box, 
144                            int &string_start,int &string_end ) const;
145    /** Find the zones used by the specified string and append them to the list. */
146    void find_zones(GList<Zone *> &list, 
147                    const int string_start, const int string_end) const;
148    /** Finds the smallest rectangles and appends them to the list. */
149    void get_smallest(GList<GRect> &list) const;
150    /** Finds the smallest rectangles and appends them to the list after
151        padding the smallest unit to fit width or height for the parent rectangle
152        and adding the number of specified pixels. */
153    void get_smallest(GList<GRect> &list,const int padding) const;
154    /// Find out this Zone's parent.
155    const Zone *get_parent(void) const;
156  private:
157    friend class DjVuTXT;
       /// Parent zone, as returned by #get_parent()#.
158    const Zone *zone_parent;
159    void cleartext();
160    void normtext(const char *instr, GUTF8String &outstr);
161    unsigned int memuse() const;
162    static const int version;
163    void encode(const GP<ByteStream> &bs, 
164                const Zone * parent=0, const Zone * prev=0) const;
165    void decode(const GP<ByteStream> &bs, int maxtext,
166                const Zone * parent=0, const Zone * prev=0);
167  };
168  /** Textual data for this page. 
169      The content of this string is encoded using the UTF8 code.
170      This code corresponds to ASCII for the first 128 characters.
171      Columns, regions, paragraphs and lines are delimited by the following
172      control characters:
173      \begin{tabular}{lll}
174        {\bf Name} & {\bf Octal} & {\bf Ascii name} \\\hline\\
175        {\tt DjVuTXT::end_of_column}    & 013 & VT, Vertical Tab \\
176        {\tt DjVuTXT::end_of_region}    & 035 & GS, Group Separator \\
177        {\tt DjVuTXT::end_of_paragraph} & 037 & US, Unit Separator \\
178        {\tt DjVuTXT::end_of_line}      & 012 & LF: Line Feed
179      \end{tabular} */
180  GUTF8String textUTF8;
181  static const char end_of_column    ;      // VT: Vertical Tab
182  static const char end_of_region    ;      // GS: Group Separator
183  static const char end_of_paragraph ;      // US: Unit Separator
184  static const char end_of_line      ;      // LF: Line Feed
185  /** Main zone in the document.
186      This zone represents the page. */
187  Zone page_zone;
188  /** Tests whether there is a meaningful zone hierarchy. */
189  int has_valid_zones() const;
190  /** Normalize textual data.  Assuming that a zone hierarchy has been built
191      and represents the reading order.  This function reorganizes the string
192      #textUTF8# by gathering the highest level text available in the zone
193      hierarchy.  The text offsets and lengths are recomputed for all the
194      zones in the hierarchy. Separators are inserted where appropriate. */
195  void normalize_text();
196  /** Encode data for a TXT chunk. */
197  void encode(const GP<ByteStream> &bs) const;
198  /** Decode data from a TXT chunk. */
199  void decode(const GP<ByteStream> &bs);
200  /** Returns a copy of this object. */
201  GP<DjVuTXT> copy(void) const;
202  /// Write XML formatted text.
203  void writeText(ByteStream &bs,const int height) const;
204  /// Get XML formatted text.
205  GUTF8String get_xmlText(const int height) const;
206  /** Find the text specified by the rectangle.  Returns a list of zones;
         the text itself is returned through #text#. */ 
207  GList<Zone*> find_text_in_rect(GRect target_rect, GUTF8String &text) const;
208  /** Find the text specified by the rectangle.  Returns a list of
         rectangles, with an optional #padding# (default 0); the text itself
         is returned through #text#. */
209  GList<GRect> find_text_with_rect(const GRect &box, GUTF8String &text, const int padding=0) const;
210  /** Get all zones of zone type zone_type under node parent.
211      zone_list contains the return value. */
212  void get_zones(int zone_type, const Zone *parent, GList<Zone *> & zone_list) const;
213  /** Returns the number of bytes needed by this data structure. It's
214      used by caching routines to estimate the size of a \Ref{DjVuImage}. */
215  unsigned int get_memory_usage() const;
216};
217
// Inline accessor: returns the parent pointer stored in zone_parent.
218inline const DjVuTXT::Zone *
219DjVuTXT::Zone::get_parent(void) const
220{
221  return zone_parent;
222}
223
224
/** Holder for the hidden text of a page.  It keeps the text data in
    member #txt# (a \Ref{DjVuTXT}) and provides the functions to decode
    it from, and encode it to, the chunks of a #FORM:TEXT#. */
225class DjVuText : public GPEnabled
226{
227protected:
      /// Protected constructor. Instances are obtained through #create()#.
228   DjVuText(void) {}
229public:
230   /// Default creator. Returns a newly allocated DjVuText as a #GP<DjVuText>#.
231   static GP<DjVuText> create(void) {return new DjVuText();}
232
233      /** Decodes a sequence of text chunks and merges contents of every
234          chunk with previously decoded information. This function
235          should be called right after applying \Ref{IFFByteStream::get_chunk}()
236          to data from #FORM:TEXT#. */
237   void decode(const GP<ByteStream> &bs);
238
239      /** Encodes the text back into a sequence of chunks to be put
240          inside a #FORM:TEXT#. */
241   void encode(const GP<ByteStream> &bs);
242
243      /// Returns a copy of this object
244   GP<DjVuText> copy(void) const;
245
246      /** Returns the number of bytes needed by this data structure. It's
247          used by caching routines to estimate the size of a \Ref{DjVuImage}. */
248   inline unsigned int get_memory_usage() const;
249
250   /// Write XML formatted text.
251   void writeText(ByteStream &bs,const int height) const;
252
253   /// Get XML formatted text.
254   GUTF8String get_xmlText(const int height) const;
255
      /** Text data of the page.  May be unset, in which case
          #get_memory_usage()# returns 0. */
256   GP<DjVuTXT>  txt;
257private: // dummy stuff
      // NOTE(review): private static overloads taking a raw ByteStream
      // pointer; presumably they exist only to reject calls that pass a raw
      // pointer instead of a GP<ByteStream> -- confirm against DjVuText.cpp.
258   static void decode(ByteStream *);
259   static void  encode(ByteStream *);
260};
261
262//@}
263
// Returns txt->get_memory_usage() when txt is set, and 0 otherwise, so the
// call is safe on an object whose text has not been decoded or assigned.
264inline unsigned int
265DjVuText::get_memory_usage() const
266{
267  return (txt)?(txt->get_memory_usage()):0;
268}
269
270
271// ----- THE END
272
273#ifdef HAVE_NAMESPACES
274}
275# ifndef NOT_USING_DJVU_NAMESPACE
276using namespace DJVU;
277# endif
278#endif
279#endif
280
281
Note: See TracBrowser for help on using the repository browser.