source: trunk/poppler/mypoppler/poppler/TextOutputDev.h @ 27

Last change on this file since 27 was 27, checked in by Eugene Romanenko, 16 years ago

poppler updated to version 0.5.2, also needed changes to be compatible with new poppler

File size: 19.1 KB
Line 
//========================================================================
//
// TextOutputDev.h
//
// Copyright 1997-2003 Glyph & Cog, LLC
//
//========================================================================
8
9#ifndef TEXTOUTPUTDEV_H
10#define TEXTOUTPUTDEV_H
11
12#ifdef USE_GCC_PRAGMAS
13#pragma interface
14#endif
15
16#include "poppler-config.h"
17#include <stdio.h>
18#include "goo/gtypes.h"
19#include "GfxFont.h"
20#include "GfxState.h"
21#include "OutputDev.h"
22
// Forward declarations of types that are only used by pointer in the
// declarations below.
class GooString;
class GooList;
class Gfx;
class GfxFont;
class GfxState;
class UnicodeMap;

// Forward declarations of the text-extraction classes (needed for the
// friend declarations and cross-references between them).
class TextWord;
class TextPool;
class TextLine;
class TextLineFrag;
class TextBlock;
class TextFlow;
class TextWordList;
class TextPage;
class TextSelectionVisitor;
39
//------------------------------------------------------------------------

// Callback used to emit extracted text.  It receives an opaque <stream>
// pointer supplied by the caller, a character buffer <text>, and a
// count <len>.
// NOTE(review): <len> is presumably the number of bytes in <text>, and
// <text> is presumably not NUL-terminated -- confirm against callers.
typedef void (*TextOutputFunc)(void *stream, char *text, int len);
43
44//------------------------------------------------------------------------
45// TextFontInfo
46//------------------------------------------------------------------------
47
48class TextFontInfo {
49public:
50
51  TextFontInfo(GfxState *state);
52  ~TextFontInfo();
53
54  GBool matches(GfxState *state);
55
56private:
57
58  GfxFont *gfxFont;
59#if TEXTOUT_WORD_LIST
60  GooString *fontName;
61#endif
62
63  friend class TextWord;
64  friend class TextPage;
65  friend class TextSelectionPainter;
66};
67
68//------------------------------------------------------------------------
69// TextWord
70//------------------------------------------------------------------------
71
72class TextWord {
73public:
74
75  // Constructor.
76  TextWord(GfxState *state, int rotA, double x0, double y0,
77           int charPosA, TextFontInfo *fontA, double fontSize);
78
79  // Destructor.
80  ~TextWord();
81
82  // Add a character to the word.
83  void addChar(GfxState *state, double x, double y,
84               double dx, double dy, CharCode c, Unicode u);
85
86  // Merge <word> onto the end of <this>.
87  void merge(TextWord *word);
88
89  // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
90  // based on a primary-axis comparison, e.g., x ordering if rot=0.
91  int primaryCmp(TextWord *word);
92
93  // Return the distance along the primary axis between <this> and
94  // <word>.
95  double primaryDelta(TextWord *word);
96
97  static int cmpYX(const void *p1, const void *p2);
98
99  void visitSelection(TextSelectionVisitor *visitor,
100                      PDFRectangle *selection);
101
102#if TEXTOUT_WORD_LIST
103  int getLength() { return len; }
104  const Unicode *getChar(int idx) { return &text[idx]; }
105  GooString *getText();
106  GooString *getFontName() { return font->fontName; }
107  void getColor(double *r, double *g, double *b)
108    { *r = colorR; *g = colorG; *b = colorB; }
109  void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA)
110    { *xMinA = xMin; *yMinA = yMin; *xMaxA = xMax; *yMaxA = yMax; }
111  double getFontSize() { return fontSize; }
112  int getRotation() { return rot; }
113  int getCharPos() { return charPos; }
114  int getCharLen() { return charLen; }
115#endif
116  double getEdge(int i) { return edge[i]; }
117  double getBaseline () { return base; }
118  GBool hasSpaceAfter  () { return spaceAfter; }
119  TextWord* nextWord () { return next; };
120private:
121
122  int rot;                      // rotation, multiple of 90 degrees
123                                //   (0, 1, 2, or 3)
124  double xMin, xMax;            // bounding box x coordinates
125  double yMin, yMax;            // bounding box y coordinates
126  double base;                  // baseline x or y coordinate
127  Unicode *text;                // the text
128  CharCode *charcode;           // glyph indices
129  double *edge;                 // "near" edge x or y coord of each char
130                                //   (plus one extra entry for the last char)
131  int len;                      // length of text and edge arrays
132  int size;                     // size of text and edge arrays
133  int charPos;                  // character position (within content stream)
134  int charLen;                  // number of content stream characters in
135                                //   this word
136  TextFontInfo *font;           // font information
137  double fontSize;              // font size
138  GBool spaceAfter;             // set if there is a space between this
139                                //   word and the next word on the line
140  TextWord *next;               // next word in line
141
142#if TEXTOUT_WORD_LIST
143  double colorR,                // word color
144         colorG,
145         colorB;
146#endif
147
148  friend class TextPool;
149  friend class TextLine;
150  friend class TextBlock;
151  friend class TextFlow;
152  friend class TextWordList;
153  friend class TextPage;
154
155  friend class TextSelectionPainter;
156  friend class TextSelectionDumper;
157};
158
159//------------------------------------------------------------------------
160// TextPool
161//------------------------------------------------------------------------
162
163class TextPool {
164public:
165
166  TextPool();
167  ~TextPool();
168
169  TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; }
170  void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; }
171
172  int getBaseIdx(double base);
173
174  void addWord(TextWord *word);
175
176private:
177
178  int minBaseIdx;               // min baseline bucket index
179  int maxBaseIdx;               // max baseline bucket index
180  TextWord **pool;              // array of linked lists, one for each
181                                //   baseline value (multiple of 4 pts)
182  TextWord *cursor;             // pointer to last-accessed word
183  int cursorBaseIdx;            // baseline bucket index of last-accessed word
184
185  friend class TextBlock;
186  friend class TextPage;
187};
188
189struct TextFlowData;
190
191//------------------------------------------------------------------------
192// TextLine
193//------------------------------------------------------------------------
194
195class TextLine {
196public:
197
198  TextLine(TextBlock *blkA, int rotA, double baseA);
199  ~TextLine();
200
201  void addWord(TextWord *word);
202
203  // Return the distance along the primary axis between <this> and
204  // <line>.
205  double primaryDelta(TextLine *line);
206
207  // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
208  // based on a primary-axis comparison, e.g., x ordering if rot=0.
209  int primaryCmp(TextLine *line);
210
211  // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
212  // based on a secondary-axis comparison of the baselines, e.g., y
213  // ordering if rot=0.
214  int secondaryCmp(TextLine *line);
215
216  int cmpYX(TextLine *line);
217
218  static int cmpXY(const void *p1, const void *p2);
219
220  void coalesce(UnicodeMap *uMap);
221
222  void visitSelection(TextSelectionVisitor *visitor,
223                      PDFRectangle *selection);
224
225private:
226
227  TextBlock *blk;               // parent block
228  int rot;                      // text rotation
229  double xMin, xMax;            // bounding box x coordinates
230  double yMin, yMax;            // bounding box y coordinates
231  double base;                  // baseline x or y coordinate
232  TextWord *words;              // words in this line
233  TextWord *lastWord;           // last word in this line
234  Unicode *text;                // Unicode text of the line, including
235                                //   spaces between words
236  double *edge;                 // "near" edge x or y coord of each char
237                                //   (plus one extra entry for the last char)
238  int *col;                     // starting column number of each Unicode char
239  int len;                      // number of Unicode chars
240  int convertedLen;             // total number of converted characters
241  GBool hyphenated;             // set if last char is a hyphen
242  TextLine *next;               // next line in block
243  Unicode *normalized;          // normalized form of Unicode text
244  int normalized_len;           // number of normalized Unicode chars
245  int *normalized_idx;          // indices of normalized chars into Unicode text
246
247  friend class TextLineFrag;
248  friend class TextBlock;
249  friend class TextFlow;
250  friend class TextWordList;
251  friend class TextPage;
252
253  friend class TextSelectionPainter;
254  friend class TextSelectionSizer;
255  friend class TextSelectionDumper;
256};
257
258//------------------------------------------------------------------------
259// TextBlock
260//------------------------------------------------------------------------
261
262class TextBlock {
263public:
264
265  TextBlock(TextPage *pageA, int rotA);
266  ~TextBlock();
267
268  void addWord(TextWord *word);
269
270  void coalesce(UnicodeMap *uMap);
271
272  // Update this block's priMin and priMax values, looking at <blk>.
273  void updatePriMinMax(TextBlock *blk);
274
275  static int cmpXYPrimaryRot(const void *p1, const void *p2);
276
277  static int cmpYXPrimaryRot(const void *p1, const void *p2);
278
279  int primaryCmp(TextBlock *blk);
280
281  double secondaryDelta(TextBlock *blk);
282
283  // Returns true if <this> is below <blk>, relative to the page's
284  // primary rotation.
285  GBool isBelow(TextBlock *blk);
286
287  void visitSelection(TextSelectionVisitor *visitor,
288                      PDFRectangle *selection);
289
290private:
291
292  TextPage *page;               // the parent page
293  int rot;                      // text rotation
294  double xMin, xMax;            // bounding box x coordinates
295  double yMin, yMax;            // bounding box y coordinates
296  double priMin, priMax;        // whitespace bounding box along primary axis
297
298  TextPool *pool;               // pool of words (used only until lines
299                                //   are built)
300  TextLine *lines;              // linked list of lines
301  TextLine *curLine;            // most recently added line
302  int nLines;                   // number of lines
303  int charCount;                // number of characters in the block
304  int col;                      // starting column
305  int nColumns;                 // number of columns in the block
306
307  TextBlock *next;
308  TextBlock *stackNext;
309
310  friend class TextLine;
311  friend class TextLineFrag;
312  friend class TextFlow;
313  friend class TextWordList;
314  friend class TextPage;
315  friend class TextSelectionPainter;
316};
317
318//------------------------------------------------------------------------
319// TextFlow
320//------------------------------------------------------------------------
321
322class TextFlow {
323public:
324
325  TextFlow(TextPage *pageA, TextBlock *blk);
326  ~TextFlow();
327
328  // Add a block to the end of this flow.
329  void addBlock(TextBlock *blk);
330
331  // Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1)
332  // it uses a font no larger than the last block added to the flow,
333  // and (2) it fits within the flow's [priMin, priMax] along the
334  // primary axis.
335  GBool blockFits(TextBlock *blk, TextBlock *prevBlk);
336
337private:
338
339  TextPage *page;               // the parent page
340  double xMin, xMax;            // bounding box x coordinates
341  double yMin, yMax;            // bounding box y coordinates
342  double priMin, priMax;        // whitespace bounding box along primary axis
343  TextBlock *blocks;            // blocks in flow
344  TextBlock *lastBlk;           // last block in this flow
345  TextFlow *next;
346
347  friend class TextWordList;
348  friend class TextPage;
349};
350
#if TEXTOUT_WORD_LIST

//------------------------------------------------------------------------
// TextWordList
//------------------------------------------------------------------------

// Flat list of the words of a TextPage.  Only available when
// TEXTOUT_WORD_LIST is enabled.
class TextWordList {
public:

  // Build a flat word list, in content stream order (if
  // text->rawOrder is true), physical layout order (if <physLayout>
  // is true and text->rawOrder is false), or reading order (if both
  // flags are false).
  TextWordList(TextPage *text, GBool physLayout);

  ~TextWordList();

  // Return the number of words on the list.
  int getLength();

  // Return the <idx>th word from the list.
  TextWord *get(int idx);

private:

  GooList *words;               // the words
};

#endif // TEXTOUT_WORD_LIST
380
381//------------------------------------------------------------------------
382// TextPage
383//------------------------------------------------------------------------
384
385class TextPage {
386public:
387
388  // Constructor.
389  TextPage(GBool rawOrderA);
390
391  // Destructor.
392  ~TextPage();
393
394  // Start a new page.
395  void startPage(GfxState *state);
396
397  // End the current page.
398  void endPage();
399
400  // Update the current font.
401  void updateFont(GfxState *state);
402
403  // Begin a new word.
404  void beginWord(GfxState *state, double x0, double y0);
405
406  // Add a character to the current word.
407  void addChar(GfxState *state, double x, double y,
408               double dx, double dy,
409               CharCode c, int nBytes, Unicode *u, int uLen);
410
411  // End the current word, sorting it into the list of words.
412  void endWord();
413
414  // Add a word, sorting it into the list of words.
415  void addWord(TextWord *word);
416
417  // Coalesce strings that look like parts of the same line.
418  void coalesce(GBool physLayout);
419
420  // Find a string.  If <startAtTop> is true, starts looking at the
421  // top of the page; else if <startAtLast> is true, starts looking
422  // immediately after the last find result; else starts looking at
423  // <xMin>,<yMin>.  If <stopAtBottom> is true, stops looking at the
424  // bottom of the page; else if <stopAtLast> is true, stops looking
425  // just before the last find result; else stops looking at
426  // <xMax>,<yMax>.
427  GBool findText(Unicode *s, int len,
428                 GBool startAtTop, GBool stopAtBottom,
429                 GBool startAtLast, GBool stopAtLast,
430                 GBool caseSensitive, GBool backward,
431                 double *xMin, double *yMin,
432                 double *xMax, double *yMax);
433
434  // Get the text which is inside the specified rectangle.
435  GooString *getText(double xMin, double yMin,
436                     double xMax, double yMax);
437
438  void visitSelection(TextSelectionVisitor *visitor,
439                      PDFRectangle *selection);
440
441  void drawSelection(OutputDev *out,
442                     double scale,
443                     int rotation,
444                     PDFRectangle *selection,
445                     GfxColor *glyph_color, GfxColor *box_color);
446
447  GooList *getSelectionRegion(PDFRectangle *selection, double scale);
448
449  GooString *getSelectionText(PDFRectangle *selection);
450
451  // Find a string by character position and length.  If found, sets
452  // the text bounding rectangle and returns true; otherwise returns
453  // false.
454  GBool findCharRange(int pos, int length,
455                      double *xMin, double *yMin,
456                      double *xMax, double *yMax);
457
458  // Dump contents of page to a file.
459  void dump(void *outputStream, TextOutputFunc outputFunc,
460            GBool physLayout);
461
462#if TEXTOUT_WORD_LIST
463  // Build a flat word list, in content stream order (if
464  // this->rawOrder is true), physical layout order (if <physLayout>
465  // is true and this->rawOrder is false), or reading order (if both
466  // flags are false).
467  TextWordList *makeWordList(GBool physLayout);
468#endif
469
470private:
471
472  void clear();
473  void assignColumns(TextLineFrag *frags, int nFrags, int rot);
474  int dumpFragment(Unicode *text, int len, UnicodeMap *uMap, GooString *s);
475
476  GBool rawOrder;               // keep text in content stream order
477
478  double pageWidth, pageHeight; // width and height of current page
479  TextWord *curWord;            // currently active string
480  int charPos;                  // next character position (within content
481                                //   stream)
482  TextFontInfo *curFont;        // current font
483  double curFontSize;           // current font size
484  int nest;                     // current nesting level (for Type 3 fonts)
485  int nTinyChars;               // number of "tiny" chars seen so far
486  GBool lastCharOverlap;        // set if the last added char overlapped the
487                                //   previous char
488
489  TextPool *pools[4];           // a "pool" of TextWords for each rotation
490  TextFlow *flows;              // linked list of flows
491  TextBlock **blocks;           // array of blocks, in yx order
492  int nBlocks;                  // number of blocks
493  int primaryRot;               // primary rotation
494  GBool primaryLR;              // primary direction (true means L-to-R,
495                                //   false means R-to-L)
496  TextWord *rawWords;           // list of words, in raw order (only if
497                                //   rawOrder is set)
498  TextWord *rawLastWord;        // last word on rawWords list
499
500  GooList *fonts;                       // all font info objects used on this
501                                //   page [TextFontInfo]
502
503  double lastFindXMin,          // coordinates of the last "find" result
504         lastFindYMin;
505  GBool haveLastFind;
506
507  friend class TextLine;
508  friend class TextLineFrag;
509  friend class TextBlock;
510  friend class TextFlow;
511  friend class TextWordList;
512  friend class TextSelectionPainter;
513  friend class TextSelectionDumper;
514};
515
516//------------------------------------------------------------------------
517// TextOutputDev
518//------------------------------------------------------------------------
519
520class TextOutputDev: public OutputDev {
521public:
522
523  // Open a text output file.  If <fileName> is NULL, no file is
524  // written (this is useful, e.g., for searching text).  If
525  // <physLayoutA> is true, the original physical layout of the text
526  // is maintained.  If <rawOrder> is true, the text is kept in
527  // content stream order.
528  TextOutputDev(char *fileName, GBool physLayoutA,
529                GBool rawOrderA, GBool append);
530
531  // Create a TextOutputDev which will write to a generic stream.  If
532  // <physLayoutA> is true, the original physical layout of the text
533  // is maintained.  If <rawOrder> is true, the text is kept in
534  // content stream order.
535  TextOutputDev(TextOutputFunc func, void *stream,
536                GBool physLayoutA, GBool rawOrderA);
537
538  // Destructor.
539  virtual ~TextOutputDev();
540
541  // Check if file was successfully created.
542  virtual GBool isOk() { return ok; }
543
544  //---- get info about output device
545
546  // Does this device use upside-down coordinates?
547  // (Upside-down means (0,0) is the top left corner of the page.)
548  virtual GBool upsideDown() { return gTrue; }
549
550  // Does this device use drawChar() or drawString()?
551  virtual GBool useDrawChar() { return gTrue; }
552
553  // Does this device use beginType3Char/endType3Char?  Otherwise,
554  // text in Type 3 fonts will be drawn with drawChar/drawString.
555  virtual GBool interpretType3Chars() { return gFalse; }
556
557  // Does this device need non-text content?
558  virtual GBool needNonText() { return gFalse; }
559
560  //----- initialization and control
561
562  // Start a page.
563  virtual void startPage(int pageNum, GfxState *state);
564
565  // End a page.
566  virtual void endPage();
567
568  //----- update text state
569  virtual void updateFont(GfxState *state);
570
571  //----- text drawing
572  virtual void beginString(GfxState *state, GooString *s);
573  virtual void endString(GfxState *state);
574  virtual void drawChar(GfxState *state, double x, double y,
575                        double dx, double dy,
576                        double originX, double originY,
577                        CharCode c, int nBytes, Unicode *u, int uLen);
578
579  //----- special access
580
581  // Find a string.  If <startAtTop> is true, starts looking at the
582  // top of the page; else if <startAtLast> is true, starts looking
583  // immediately after the last find result; else starts looking at
584  // <xMin>,<yMin>.  If <stopAtBottom> is true, stops looking at the
585  // bottom of the page; else if <stopAtLast> is true, stops looking
586  // just before the last find result; else stops looking at
587  // <xMax>,<yMax>.
588  GBool findText(Unicode *s, int len,
589                 GBool startAtTop, GBool stopAtBottom,
590                 GBool startAtLast, GBool stopAtLast,
591                 GBool caseSensitive, GBool backward,
592                 double *xMin, double *yMin,
593                 double *xMax, double *yMax);
594
595  // Get the text which is inside the specified rectangle.
596  GooString *getText(double xMin, double yMin,
597                   double xMax, double yMax);
598
599  // Find a string by character position and length.  If found, sets
600  // the text bounding rectangle and returns true; otherwise returns
601  // false.
602  GBool findCharRange(int pos, int length,
603                      double *xMin, double *yMin,
604                      double *xMax, double *yMax);
605
606  void drawSelection(OutputDev *out, double scale, int rotation,
607                     PDFRectangle *selection,
608                     GfxColor *glyph_color, GfxColor *box_color);
609
610  GooList *getSelectionRegion(PDFRectangle *selection, double scale);
611
612  GooString *getSelectionText(PDFRectangle *selection);
613
614#if TEXTOUT_WORD_LIST
615  // Build a flat word list, in content stream order (if
616  // this->rawOrder is true), physical layout order (if
617  // this->physLayout is true and this->rawOrder is false), or reading
618  // order (if both flags are false).
619  TextWordList *makeWordList();
620#endif
621
622  // Returns the TextPage object for the last rasterized page,
623  // transferring ownership to the caller.
624  TextPage *takeText();
625
626private:
627
628  TextOutputFunc outputFunc;    // output function
629  void *outputStream;           // output stream
630  GBool needClose;              // need to close the output file?
631                                //   (only if outputStream is a FILE*)
632  TextPage *text;               // text for the current page
633  GBool physLayout;             // maintain original physical layout when
634                                //   dumping text
635  GBool rawOrder;               // keep text in content stream order
636  GBool ok;                     // set up ok?
637};
638
639#endif
Note: See TracBrowser for help on using the repository browser.