1 | /*
|
---|
2 | Data structures for encoding transformations.
|
---|
3 |
|
---|
4 | Perl works internally in either a native 'byte' encoding or
|
---|
5 | in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t"
|
---|
6 | representation. When we do we can use utf8_to_uv().
|
---|
7 |
|
---|
8 | Most character encodings are either simple byte mappings or
|
---|
9 | variable length multi-byte encodings. UTF-8 can be viewed as a
|
---|
10 | rather extreme case of the latter.
|
---|
11 |
|
---|
12 | So to solve an important part of perl's encode needs we need to solve the
|
---|
13 | "multi-byte -> multi-byte" case. The simple byte forms are then just a degenerate
|
---|
14 | case. (Where one of the multi-byte encodings will usually be UTF-8.)
|
---|
15 |
|
---|
16 | The other type of encoding is a shift encoding where a prefix sequence
|
---|
17 | determines what subsequent bytes mean. Such encodings have state.
|
---|
18 |
|
---|
19 | We also need to handle the case where a character in one encoding has to be
|
---|
20 | represented as multiple characters in the other. e.g. letter+diacritic.
|
---|
21 |
|
---|
22 | The process can be considered as pseudo perl:
|
---|
23 |
|
---|
24 | my $dst = '';
|
---|
25 | while (length($src))
|
---|
26 | {
|
---|
27 | my $size = src_count($src);
|
---|
28 | my $in_seq = substr($src,0,$size,'');
|
---|
29 | my $out_seq = $s2d_hash{$in_seq};
|
---|
30 | if (defined $out_seq)
|
---|
31 | {
|
---|
32 | $dst .= $out_seq;
|
---|
33 | }
|
---|
34 | else
|
---|
35 | {
|
---|
36 | # an error condition
|
---|
37 | }
|
---|
38 | }
|
---|
39 | return $dst;
|
---|
40 |
|
---|
41 | That has the following components:
|
---|
42 | &src_count - a "rule" for how many bytes make up the next character in the
|
---|
43 | source.
|
---|
44 | %s2d_hash - a mapping from input sequences to output sequences
|
---|
45 |
|
---|
46 | The problem with that scheme is that it does not allow the output
|
---|
47 | character repertoire to affect the characters considered from the
|
---|
48 | input.
|
---|
49 |
|
---|
50 | So we use a "trie" representation which can also be considered
|
---|
51 | a state machine:
|
---|
52 |
|
---|
53 | my $dst = '';
|
---|
54 | my $seq = \@s2d_seq;
|
---|
55 | my $next = \@s2d_next;
|
---|
56 | while (length($src))
|
---|
57 | {
|
---|
58 | my $byte = substr($src,0,1,'');
|
---|
59 | my $out_seq = $seq->[$byte];
|
---|
60 | if (defined $out_seq)
|
---|
61 | {
|
---|
62 | $dst .= $out_seq;
|
---|
63 | }
|
---|
64 | else
|
---|
65 | {
|
---|
66 | # an error condition
|
---|
67 | }
|
---|
68 | ($next,$seq) = @{$next->[$byte]} if $next;
|
---|
69 | }
|
---|
70 | return $dst;
|
---|
71 |
|
---|
72 | There is now a pair of data structures to represent everything.
|
---|
73 | It is valid for the output sequence at a particular point to
|
---|
74 | be defined but zero length, that just means "don't know yet".
|
---|
75 | For the single byte case there is no 'next' so new tables will be the same as
|
---|
76 | the original tables. For a multi-byte case a prefix byte will flip to the tables
|
---|
77 | for the next page (adding nothing to the output), then the tables for the page
|
---|
78 | will provide the actual output and set tables back to original base page.
|
---|
79 |
|
---|
80 | This scheme can also handle shift encodings.
|
---|
81 |
|
---|
82 | A slight enhancement to the scheme also allows for look-ahead - if
|
---|
83 | we add a flag to re-add the removed byte to the source we could handle
|
---|
84 | a" -> ä
|
---|
85 | ab -> a (and take b back please)
|
---|
86 |
|
---|
87 | */
|
---|
88 |
|
---|
89 | #include <EXTERN.h>
|
---|
90 | #include <perl.h>
|
---|
91 | #define U8 U8
|
---|
92 | #include "encode.h"
|
---|
93 |
|
---|
94 | int
|
---|
95 | do_encode(encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
|
---|
96 | STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
|
---|
97 | {
|
---|
98 | const U8 *s = src;
|
---|
99 | const U8 *send = s + *slen;
|
---|
100 | const U8 *last = s;
|
---|
101 | U8 *d = dst;
|
---|
102 | U8 *dend = d + dlen, *dlast = d;
|
---|
103 | int code = 0;
|
---|
104 | while (s < send) {
|
---|
105 | encpage_t *e = enc;
|
---|
106 | U8 byte = *s;
|
---|
107 | while (byte > e->max)
|
---|
108 | e++;
|
---|
109 | if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
|
---|
110 | const U8 *cend = s + (e->slen & 0x7f);
|
---|
111 | if (cend <= send) {
|
---|
112 | STRLEN n;
|
---|
113 | if ((n = e->dlen)) {
|
---|
114 | const U8 *out = e->seq + n * (byte - e->min);
|
---|
115 | U8 *oend = d + n;
|
---|
116 | if (dst) {
|
---|
117 | if (oend <= dend) {
|
---|
118 | while (d < oend)
|
---|
119 | *d++ = *out++;
|
---|
120 | }
|
---|
121 | else {
|
---|
122 | /* Out of space */
|
---|
123 | code = ENCODE_NOSPACE;
|
---|
124 | break;
|
---|
125 | }
|
---|
126 | }
|
---|
127 | else
|
---|
128 | d = oend;
|
---|
129 | }
|
---|
130 | enc = e->next;
|
---|
131 | s++;
|
---|
132 | if (s == cend) {
|
---|
133 | if (approx && (e->slen & 0x80))
|
---|
134 | code = ENCODE_FALLBACK;
|
---|
135 | last = s;
|
---|
136 | if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
|
---|
137 | code = ENCODE_FOUND_TERM;
|
---|
138 | break;
|
---|
139 | }
|
---|
140 | dlast = d;
|
---|
141 | }
|
---|
142 | }
|
---|
143 | else {
|
---|
144 | /* partial source character */
|
---|
145 | code = ENCODE_PARTIAL;
|
---|
146 | break;
|
---|
147 | }
|
---|
148 | }
|
---|
149 | else {
|
---|
150 | /* Cannot represent */
|
---|
151 | code = ENCODE_NOREP;
|
---|
152 | break;
|
---|
153 | }
|
---|
154 | }
|
---|
155 | *slen = last - src;
|
---|
156 | *dout = d - dst;
|
---|
157 | return code;
|
---|
158 | }
|
---|