source: trunk/poppler/mypoppler/poppler/TextOutputDev.h @ 2

Last change on this file since 2 was 2, checked in by Eugene Romanenko, 16 years ago

First import

File size: 18.9 KB
Line 
1//========================================================================
2//
3// TextOutputDev.h
4//
5// Copyright 1997-2003 Glyph & Cog, LLC
6//
7//========================================================================
8
9#ifndef TEXTOUTPUTDEV_H
10#define TEXTOUTPUTDEV_H
11
12#ifdef USE_GCC_PRAGMAS
13#pragma interface
14#endif
15
16#include "poppler-config.h"
17#include <stdio.h>
18#include "goo/gtypes.h"
19#include "GfxFont.h"
20#include "GfxState.h"
21#include "OutputDev.h"
22
// Types from other parts of the library that this header names.
class GooString;
class GooList;
class Gfx;
class GfxFont;
class GfxState;
class UnicodeMap;

// Text-extraction classes.  TextLineFrag and TextSelectionVisitor are
// only forward-declared in this header.
class TextWord;
class TextPool;
class TextLine;
class TextLineFrag;
class TextBlock;
class TextFlow;
class TextWordList;
class TextPage;
class TextSelectionVisitor;
39
40//------------------------------------------------------------------------
41
// Callback used to emit extracted text.  <stream> is an opaque pointer
// supplied by the caller alongside the callback; <text> points to <len>
// characters of output.
typedef void (*TextOutputFunc)(void *stream, char *text, int len);
43
44//------------------------------------------------------------------------
45// TextFontInfo
46//------------------------------------------------------------------------
47
48class TextFontInfo {
49public:
50
51  TextFontInfo(GfxState *state);
52  ~TextFontInfo();
53
54  GBool matches(GfxState *state);
55
56private:
57
58  GfxFont *gfxFont;
59#if TEXTOUT_WORD_LIST
60  GooString *fontName;
61#endif
62
63  friend class TextWord;
64  friend class TextPage;
65  friend class TextSelectionPainter;
66};
67
68//------------------------------------------------------------------------
69// TextWord
70//------------------------------------------------------------------------
71
72class TextWord {
73public:
74
75  // Constructor.
76  TextWord(GfxState *state, int rotA, double x0, double y0,
77           int charPosA, TextFontInfo *fontA, double fontSize);
78
79  // Destructor.
80  ~TextWord();
81
82  // Add a character to the word.
83  void addChar(GfxState *state, double x, double y,
84               double dx, double dy, CharCode c, Unicode u);
85
86  // Merge <word> onto the end of <this>.
87  void merge(TextWord *word);
88
89  // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
90  // based on a primary-axis comparison, e.g., x ordering if rot=0.
91  int primaryCmp(TextWord *word);
92
93  // Return the distance along the primary axis between <this> and
94  // <word>.
95  double primaryDelta(TextWord *word);
96
97  static int cmpYX(const void *p1, const void *p2);
98
99  void visitSelection(TextSelectionVisitor *visitor,
100                      PDFRectangle *selection);
101
102#if TEXTOUT_WORD_LIST
103  int getLength() { return len; }
104  const Unicode *getChar(int idx) { return &text[idx]; }
105  GooString *getText();
106  GooString *getFontName() { return font->fontName; }
107  void getColor(double *r, double *g, double *b)
108    { *r = colorR; *g = colorG; *b = colorB; }
109  void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA)
110    { *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; }
111  double getFontSize() { return fontSize; }
112  int getRotation() { return rot; }
113  int getCharPos() { return charPos; }
114  int getCharLen() { return charLen; }
115#endif
116  double getEdge(int i) { return edge[i]; }
117  double getBaseline () { return base; }
118  GBool hasSpaceAfter  () { return spaceAfter; }
119  TextWord* nextWord () { return next; };
120private:
121
122  int rot;                      // rotation, multiple of 90 degrees
123                                //   (0, 1, 2, or 3)
124  double xMin, xMax;            // bounding box x coordinates
125  double yMin, yMax;            // bounding box y coordinates
126  double base;                  // baseline x or y coordinate
127  Unicode *text;                // the text
128  CharCode *charcode;           // glyph indices
129  double *edge;                 // "near" edge x or y coord of each char
130                                //   (plus one extra entry for the last char)
131  int len;                      // length of text and edge arrays
132  int size;                     // size of text and edge arrays
133  int charPos;                  // character position (within content stream)
134  int charLen;                  // number of content stream characters in
135                                //   this word
136  TextFontInfo *font;           // font information
137  double fontSize;              // font size
138  GBool spaceAfter;             // set if there is a space between this
139                                //   word and the next word on the line
140  TextWord *next;               // next word in line
141
142#if TEXTOUT_WORD_LIST
143  double colorR,                // word color
144         colorG,
145         colorB;
146#endif
147
148  friend class TextPool;
149  friend class TextLine;
150  friend class TextBlock;
151  friend class TextFlow;
152  friend class TextWordList;
153  friend class TextPage;
154
155  friend class TextSelectionPainter;
156  friend class TextSelectionDumper;
157};
158
159//------------------------------------------------------------------------
160// TextPool
161//------------------------------------------------------------------------
162
163class TextPool {
164public:
165
166  TextPool();
167  ~TextPool();
168
169  TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; }
170  void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; }
171
172  int getBaseIdx(double base);
173
174  void addWord(TextWord *word);
175
176private:
177
178  int minBaseIdx;               // min baseline bucket index
179  int maxBaseIdx;               // max baseline bucket index
180  TextWord **pool;              // array of linked lists, one for each
181                                //   baseline value (multiple of 4 pts)
182  TextWord *cursor;             // pointer to last-accessed word
183  int cursorBaseIdx;            // baseline bucket index of last-accessed word
184
185  friend class TextBlock;
186  friend class TextPage;
187};
188
// Forward declaration only; the struct is not defined in this header.
struct TextFlowData;
190
191//------------------------------------------------------------------------
192// TextLine
193//------------------------------------------------------------------------
194
195class TextLine {
196public:
197
198  TextLine(TextBlock *blkA, int rotA, double baseA);
199  ~TextLine();
200
201  void addWord(TextWord *word);
202
203  // Return the distance along the primary axis between <this> and
204  // <line>.
205  double primaryDelta(TextLine *line);
206
207  // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
208  // based on a primary-axis comparison, e.g., x ordering if rot=0.
209  int primaryCmp(TextLine *line);
210
211  // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
212  // based on a secondary-axis comparison of the baselines, e.g., y
213  // ordering if rot=0.
214  int secondaryCmp(TextLine *line);
215
216  int cmpYX(TextLine *line);
217
218  static int cmpXY(const void *p1, const void *p2);
219
220  void coalesce(UnicodeMap *uMap);
221
222  void visitSelection(TextSelectionVisitor *visitor,
223                      PDFRectangle *selection);
224
225private:
226
227  TextBlock *blk;               // parent block
228  int rot;                      // text rotation
229  double xMin, xMax;            // bounding box x coordinates
230  double yMin, yMax;            // bounding box y coordinates
231  double base;                  // baseline x or y coordinate
232  TextWord *words;              // words in this line
233  TextWord *lastWord;           // last word in this line
234  Unicode *text;                // Unicode text of the line, including
235                                //   spaces between words
236  double *edge;                 // "near" edge x or y coord of each char
237                                //   (plus one extra entry for the last char)
238  int *col;                     // starting column number of each Unicode char
239  int len;                      // number of Unicode chars
240  int convertedLen;             // total number of converted characters
241  GBool hyphenated;             // set if last char is a hyphen
242  TextLine *next;               // next line in block
243
244  friend class TextLineFrag;
245  friend class TextBlock;
246  friend class TextFlow;
247  friend class TextWordList;
248  friend class TextPage;
249
250  friend class TextSelectionPainter;
251  friend class TextSelectionSizer;
252  friend class TextSelectionDumper;
253};
254
255//------------------------------------------------------------------------
256// TextBlock
257//------------------------------------------------------------------------
258
259class TextBlock {
260public:
261
262  TextBlock(TextPage *pageA, int rotA);
263  ~TextBlock();
264
265  void addWord(TextWord *word);
266
267  void coalesce(UnicodeMap *uMap);
268
269  // Update this block's priMin and priMax values, looking at <blk>.
270  void updatePriMinMax(TextBlock *blk);
271
272  static int cmpXYPrimaryRot(const void *p1, const void *p2);
273
274  static int cmpYXPrimaryRot(const void *p1, const void *p2);
275
276  int primaryCmp(TextBlock *blk);
277
278  double secondaryDelta(TextBlock *blk);
279
280  // Returns true if <this> is below <blk>, relative to the page's
281  // primary rotation.
282  GBool isBelow(TextBlock *blk);
283
284  void visitSelection(TextSelectionVisitor *visitor,
285                      PDFRectangle *selection);
286
287private:
288
289  TextPage *page;               // the parent page
290  int rot;                      // text rotation
291  double xMin, xMax;            // bounding box x coordinates
292  double yMin, yMax;            // bounding box y coordinates
293  double priMin, priMax;        // whitespace bounding box along primary axis
294
295  TextPool *pool;               // pool of words (used only until lines
296                                //   are built)
297  TextLine *lines;              // linked list of lines
298  TextLine *curLine;            // most recently added line
299  int nLines;                   // number of lines
300  int charCount;                // number of characters in the block
301  int col;                      // starting column
302  int nColumns;                 // number of columns in the block
303
304  TextBlock *next;
305  TextBlock *stackNext;
306
307  friend class TextLine;
308  friend class TextLineFrag;
309  friend class TextFlow;
310  friend class TextWordList;
311  friend class TextPage;
312  friend class TextSelectionPainter;
313};
314
315//------------------------------------------------------------------------
316// TextFlow
317//------------------------------------------------------------------------
318
319class TextFlow {
320public:
321
322  TextFlow(TextPage *pageA, TextBlock *blk);
323  ~TextFlow();
324
325  // Add a block to the end of this flow.
326  void addBlock(TextBlock *blk);
327
328  // Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1)
329  // it uses a font no larger than the last block added to the flow,
330  // and (2) it fits within the flow's [priMin, priMax] along the
331  // primary axis.
332  GBool blockFits(TextBlock *blk, TextBlock *prevBlk);
333
334private:
335
336  TextPage *page;               // the parent page
337  double xMin, xMax;            // bounding box x coordinates
338  double yMin, yMax;            // bounding box y coordinates
339  double priMin, priMax;        // whitespace bounding box along primary axis
340  TextBlock *blocks;            // blocks in flow
341  TextBlock *lastBlk;           // last block in this flow
342  TextFlow *next;
343
344  friend class TextWordList;
345  friend class TextPage;
346};
347
348#if TEXTOUT_WORD_LIST
349
350//------------------------------------------------------------------------
351// TextWordList
352//------------------------------------------------------------------------
353
354class TextWordList {
355public:
356
357  // Build a flat word list, in content stream order (if
358  // text->rawOrder is true), physical layout order (if <physLayout>
359  // is true and text->rawOrder is false), or reading order (if both
360  // flags are false).
361  TextWordList(TextPage *text, GBool physLayout);
362
363  ~TextWordList();
364
365  // Return the number of words on the list.
366  int getLength();
367
368  // Return the <idx>th word from the list.
369  TextWord *get(int idx);
370
371private:
372
373  GooList *words;
374};
375
376#endif // TEXTOUT_WORD_LIST
377
378//------------------------------------------------------------------------
379// TextPage
380//------------------------------------------------------------------------
381
382class TextPage {
383public:
384
385  // Constructor.
386  TextPage(GBool rawOrderA);
387
388  // Destructor.
389  ~TextPage();
390
391  // Start a new page.
392  void startPage(GfxState *state);
393
394  // End the current page.
395  void endPage();
396
397  // Update the current font.
398  void updateFont(GfxState *state);
399
400  // Begin a new word.
401  void beginWord(GfxState *state, double x0, double y0);
402
403  // Add a character to the current word.
404  void addChar(GfxState *state, double x, double y,
405               double dx, double dy,
406               CharCode c, int nBytes, Unicode *u, int uLen);
407
408  // End the current word, sorting it into the list of words.
409  void endWord();
410
411  // Add a word, sorting it into the list of words.
412  void addWord(TextWord *word);
413
414  // Coalesce strings that look like parts of the same line.
415  void coalesce(GBool physLayout);
416
417  // Find a string.  If <startAtTop> is true, starts looking at the
418  // top of the page; else if <startAtLast> is true, starts looking
419  // immediately after the last find result; else starts looking at
420  // <xMin>,<yMin>.  If <stopAtBottom> is true, stops looking at the
421  // bottom of the page; else if <stopAtLast> is true, stops looking
422  // just before the last find result; else stops looking at
423  // <xMax>,<yMax>.
424  GBool findText(Unicode *s, int len,
425                 GBool startAtTop, GBool stopAtBottom,
426                 GBool startAtLast, GBool stopAtLast,
427                 GBool caseSensitive, GBool backward,
428                 double *xMin, double *yMin,
429                 double *xMax, double *yMax);
430
431  // Get the text which is inside the specified rectangle.
432  GooString *getText(double xMin, double yMin,
433                     double xMax, double yMax);
434
435  void visitSelection(TextSelectionVisitor *visitor,
436                      PDFRectangle *selection);
437
438  void drawSelection(OutputDev *out,
439                     double scale,
440                     int rotation,
441                     PDFRectangle *selection,
442                     GfxColor *glyph_color, GfxColor *box_color);
443
444  GooList *getSelectionRegion(PDFRectangle *selection, double scale);
445
446  GooString *getSelectionText(PDFRectangle *selection);
447
448  // Find a string by character position and length.  If found, sets
449  // the text bounding rectangle and returns true; otherwise returns
450  // false.
451  GBool findCharRange(int pos, int length,
452                      double *xMin, double *yMin,
453                      double *xMax, double *yMax);
454
455  // Dump contents of page to a file.
456  void dump(void *outputStream, TextOutputFunc outputFunc,
457            GBool physLayout);
458
459#if TEXTOUT_WORD_LIST
460  // Build a flat word list, in content stream order (if
461  // this->rawOrder is true), physical layout order (if <physLayout>
462  // is true and this->rawOrder is false), or reading order (if both
463  // flags are false).
464  TextWordList *makeWordList(GBool physLayout);
465#endif
466
467private:
468
469  void clear();
470  void assignColumns(TextLineFrag *frags, int nFrags, int rot);
471  int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s);
472
473  GBool rawOrder;               // keep text in content stream order
474
475  double pageWidth, pageHeight; // width and height of current page
476  TextWord *curWord;            // currently active string
477  int charPos;                  // next character position (within content
478                                //   stream)
479  TextFontInfo *curFont;        // current font
480  double curFontSize;           // current font size
481  int nest;                     // current nesting level (for Type 3 fonts)
482  int nTinyChars;               // number of "tiny" chars seen so far
483  GBool lastCharOverlap;        // set if the last added char overlapped the
484                                //   previous char
485
486  TextPool *pools[4];           // a "pool" of TextWords for each rotation
487  TextFlow *flows;              // linked list of flows
488  TextBlock **blocks;           // array of blocks, in yx order
489  int nBlocks;                  // number of blocks
490  int primaryRot;               // primary rotation
491  GBool primaryLR;              // primary direction (true means L-to-R,
492                                //   false means R-to-L)
493  TextWord *rawWords;           // list of words, in raw order (only if
494                                //   rawOrder is set)
495  TextWord *rawLastWord;        // last word on rawWords list
496
497  GooList *fonts;                       // all font info objects used on this
498                                //   page [TextFontInfo]
499
500  double lastFindXMin,          // coordinates of the last "find" result
501         lastFindYMin;
502  GBool haveLastFind;
503
504  friend class TextLine;
505  friend class TextLineFrag;
506  friend class TextBlock;
507  friend class TextFlow;
508  friend class TextWordList;
509  friend class TextSelectionPainter;
510  friend class TextSelectionDumper;
511};
512
513//------------------------------------------------------------------------
514// TextOutputDev
515//------------------------------------------------------------------------
516
517class TextOutputDev: public OutputDev {
518public:
519
520  // Open a text output file.  If <fileName> is NULL, no file is
521  // written (this is useful, e.g., for searching text).  If
522  // <physLayoutA> is true, the original physical layout of the text
523  // is maintained.  If <rawOrder> is true, the text is kept in
524  // content stream order.
525  TextOutputDev(char *fileName, GBool physLayoutA,
526                GBool rawOrderA, GBool append);
527
528  // Create a TextOutputDev which will write to a generic stream.  If
529  // <physLayoutA> is true, the original physical layout of the text
530  // is maintained.  If <rawOrder> is true, the text is kept in
531  // content stream order.
532  TextOutputDev(TextOutputFunc func, void *stream,
533                GBool physLayoutA, GBool rawOrderA);
534
535  // Destructor.
536  virtual ~TextOutputDev();
537
538  // Check if file was successfully created.
539  virtual GBool isOk() { return ok; }
540
541  //---- get info about output device
542
543  // Does this device use upside-down coordinates?
544  // (Upside-down means (0,0) is the top left corner of the page.)
545  virtual GBool upsideDown() { return gTrue; }
546
547  // Does this device use drawChar() or drawString()?
548  virtual GBool useDrawChar() { return gTrue; }
549
550  // Does this device use beginType3Char/endType3Char?  Otherwise,
551  // text in Type 3 fonts will be drawn with drawChar/drawString.
552  virtual GBool interpretType3Chars() { return gFalse; }
553
554  // Does this device need non-text content?
555  virtual GBool needNonText() { return gFalse; }
556
557  //----- initialization and control
558
559  // Start a page.
560  virtual void startPage(int pageNum, GfxState *state);
561
562  // End a page.
563  virtual void endPage();
564
565  //----- update text state
566  virtual void updateFont(GfxState *state);
567
568  //----- text drawing
569  virtual void beginString(GfxState *state, GooString *s);
570  virtual void endString(GfxState *state);
571  virtual void drawChar(GfxState *state, double x, double y,
572                        double dx, double dy,
573                        double originX, double originY,
574                        CharCode c, int nBytes, Unicode *u, int uLen);
575
576  //----- special access
577
578  // Find a string.  If <startAtTop> is true, starts looking at the
579  // top of the page; else if <startAtLast> is true, starts looking
580  // immediately after the last find result; else starts looking at
581  // <xMin>,<yMin>.  If <stopAtBottom> is true, stops looking at the
582  // bottom of the page; else if <stopAtLast> is true, stops looking
583  // just before the last find result; else stops looking at
584  // <xMax>,<yMax>.
585  GBool findText(Unicode *s, int len,
586                 GBool startAtTop, GBool stopAtBottom,
587                 GBool startAtLast, GBool stopAtLast,
588                 GBool caseSensitive, GBool backward,
589                 double *xMin, double *yMin,
590                 double *xMax, double *yMax);
591
592  // Get the text which is inside the specified rectangle.
593  GooString *getText(double xMin, double yMin,
594                   double xMax, double yMax);
595
596  // Find a string by character position and length.  If found, sets
597  // the text bounding rectangle and returns true; otherwise returns
598  // false.
599  GBool findCharRange(int pos, int length,
600                      double *xMin, double *yMin,
601                      double *xMax, double *yMax);
602
603  void drawSelection(OutputDev *out, double scale, int rotation,
604                     PDFRectangle *selection,
605                     GfxColor *glyph_color, GfxColor *box_color);
606
607  GooList *getSelectionRegion(PDFRectangle *selection, double scale);
608
609  GooString *getSelectionText(PDFRectangle *selection);
610
611#if TEXTOUT_WORD_LIST
612  // Build a flat word list, in content stream order (if
613  // this->rawOrder is true), physical layout order (if
614  // this->physLayout is true and this->rawOrder is false), or reading
615  // order (if both flags are false).
616  TextWordList *makeWordList();
617#endif
618
619  // Returns the TextPage object for the last rasterized page,
620  // transferring ownership to the caller.
621  TextPage *takeText();
622
623private:
624
625  TextOutputFunc outputFunc;    // output function
626  void *outputStream;           // output stream
627  GBool needClose;              // need to close the output file?
628                                //   (only if outputStream is a FILE*)
629  TextPage *text;               // text for the current page
630  GBool physLayout;             // maintain original physical layout when
631                                //   dumping text
632  GBool rawOrder;               // keep text in content stream order
633  GBool ok;                     // set up ok?
634};
635
636#endif
Note: See TracBrowser for help on using the repository browser.