forked from coolwanglu/pdf2htmlEX
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHTMLTextLine.h
134 lines (111 loc) · 3.7 KB
/
HTMLTextLine.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
/*
* Header file for HTMLTextLine
* Copyright (C) 2013 Lu Wang <coolwanglu@gmail.com>
*/
#ifndef HTMLTEXTLINE_H__
#define HTMLTEXTLINE_H__
#include <ostream>
#include <vector>
#include <CharTypes.h>
#include "Param.h"
#include "StateManager.h"
#include "HTMLState.h"
namespace pdf2htmlEX {
/*
* Store and optimize a line of text in HTML
*
* contains a series of
* - Text          (code points kept in 'text' / 'decomposed_text')
* - Shift         (presumably the Offset entries in 'offsets' -- TODO confirm)
* - State change  (State entries in 'states')
*
* Only the small inline helpers are defined in this header; the constructor
* and the remaining member functions are declared here and defined elsewhere.
*/
class HTMLTextLine
{
public:
/*
* 'param' and 'all_manager' are taken by reference and the class has
* reference members with the same names, so presumably the referenced
* objects must outlive this line -- confirm against the constructor
* definition. 'line_state' has a by-value member counterpart.
*/
HTMLTextLine (const HTMLLineState & line_state, const Param & param, AllStateManager & all_manager);
/*
* A text state (extends HTMLTextState) plus the bookkeeping declared below:
* per-property ids, the index of the first text element using the state,
* and hash values used for optimization.
*/
struct State : public HTMLTextState {
// before output
// prev_state: pointer to another State; presumably the one emitted just
// before this one, and possibly null -- TODO confirm in the definition
void begin(std::ostream & out, const State * prev_state);
// after output
void end(std::ostream & out) const;
// calculate the hash code
void hash(void);
// calculate the difference between another State
int diff(const State & s) const;
// Indices into 'ids'. Enumerators count up from 0, so FONT_ID..WORD_SPACE_ID
// are 0..5 and HASH_ID_COUNT is 6. VERTICAL_ALIGN_ID is explicitly given the
// same value as HASH_ID_COUNT (6), which makes ID_COUNT 7.
// NOTE(review): VERTICAL_ALIGN_ID lying outside [0, HASH_ID_COUNT) suggests it
// is excluded from the hash -- confirm in hash()/diff().
enum {
FONT_ID,
FONT_SIZE_ID,
FILL_COLOR_ID,
STROKE_COLOR_ID,
LETTER_SPACE_ID,
WORD_SPACE_ID,
HASH_ID_COUNT,
VERTICAL_ALIGN_ID = HASH_ID_COUNT,
ID_COUNT
};
// presumably returns the bit mask within the hash that belongs to the given
// id -- TODO confirm in the definition
static long long umask_by_id(int id);
// one slot per enumerator above (ID_COUNT entries)
long long ids[ID_COUNT];
size_t start_idx; // index of the first Text using this state
// for optimization
long long hash_value;
long long hash_umask; // some states may not be actually used
// NOTE(review): presumably records whether end() has something to close --
// confirm against begin()/end()
bool need_close;
static const char * const css_class_names []; // class names for each id
};
/*
* A width attached to a position in 'text'.
*/
struct Offset {
// The parameter 'width' shadows the member of the same name; in the
// initializer list 'width(width)' initializes the member from the parameter.
Offset(size_t size_idx, double width)
:start_idx(size_idx),width(width)
{ }
size_t start_idx; // should put this Offset right before text[start_idx];
double width;
};
/**
* Append a drawn char (glyph)'s unicode. l > 1 means this glyph corresponds to
* multiple code points.
*
* u points to the code point(s) and l is their count.
* NOTE(review): 'width' is presumably the width contributed by this glyph --
* confirm in the definition.
*/
void append_unicodes(const Unicode * u, int l, double width);
/**
* Append a special padding char with 0 width, in order to keep char index consistent.
* The padding char is ignored during output.
*
* Implemented by pushing a 0 onto 'text'.
*/
void append_padding_char() { text.push_back(0); }
void append_offset(double width);
void append_state(const HTMLTextState & text_state);
void dump_text(std::ostream & out);
// True when 'text' has no elements. Padding chars are elements of 'text',
// so a line holding only padding chars is not reported as empty.
bool text_empty(void) const { return text.empty(); }
void clear(void);
void clip(const HTMLClipState &);
/*
* Optimize and calculate necessary values
*/
void prepare(void);
void optimize(std::vector<HTMLTextLine*> &);
private:
// NOTE(review): presumably the two strategies optimize() chooses between --
// confirm in the definition
void optimize_normal(std::vector<HTMLTextLine*> &);
void optimize_aggressive(std::vector<HTMLTextLine*> &);
/**
* Dump chars' unicode to output stream.
* begin/pos is the index in 'text'.
*/
void dump_chars(std::ostream & out, int begin, int len);
void dump_char(std::ostream & out, int pos);
// Reference members: they cannot be rebound after construction.
const Param & param;
AllStateManager & all_manager;
HTMLLineState line_state;
double ascent, descent;
double clip_x1, clip_y1;
double width;
std::vector<State> states;
std::vector<Offset> offsets;
/**
* Drawn chars (glyph) in this line are stored in 'text'. For each element c in 'text':
* - If c > 0, it is the unicode code point corresponding to the glyph;
* - If c == 0, it is a padding char, and ignored during output (TODO some bad PDFs utilize 0?);
* - If c < 0, this glyph corresponds to more than one unicode code point,
* which are stored in 'decomposed_text', and (-c-1) is the index in 'decomposed_text'.
*
* NOTE(review): this comment previously read "c < -1", but with index (-c-1)
* the value -1 maps to index 0, so the condition is taken to be c < 0 --
* confirm against append_unicodes().
*/
std::vector<int> text;
std::vector<std::vector<Unicode> > decomposed_text;
};
} // namespace pdf2htmlEX
#endif //HTMLTEXTLINE_H__