1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
| #pragma pack(push, 1)
/*
 * chartoutf - UTF-8 byte sequences for the upper half of CP1251.
 *
 * Row i holds the UTF-8 encoding of CP1251 byte (0x80 + i).  Every row is
 * 4 bytes wide; the encodings stored here are 2 or 3 bytes long and the
 * unused tail of each row is zero, so each row can be read as a
 * NUL-terminated string.
 * Byte 0x98 is encoded as U+FFFD (replacement character).
 */
char chartoutf[0x80][4] = {
	"\xD0\x82",		/* 0x80  U+0402  Ђ */
	"\xD0\x83",		/* 0x81  U+0403  Ѓ */
	"\xE2\x80\x9A",		/* 0x82  U+201A  ‚ */
	"\xD1\x93",		/* 0x83  U+0453  ѓ */
	"\xE2\x80\x9E",		/* 0x84  U+201E  „ */
	"\xE2\x80\xA6",		/* 0x85  U+2026  … */
	"\xE2\x80\xA0",		/* 0x86  U+2020  † */
	"\xE2\x80\xA1",		/* 0x87  U+2021  ‡ */
	"\xE2\x82\xAC",		/* 0x88  U+20AC  € (euro sign) */
	"\xE2\x80\xB0",		/* 0x89  U+2030  ‰ */
	"\xD0\x89",		/* 0x8A  U+0409  Љ */
	"\xE2\x80\xB9",		/* 0x8B  U+2039  ‹ */
	"\xD0\x8A",		/* 0x8C  U+040A  Њ */
	"\xD0\x8C",		/* 0x8D  U+040C  Ќ */
	"\xD0\x8B",		/* 0x8E  U+040B  Ћ */
	"\xD0\x8F",		/* 0x8F  U+040F  Џ */
	"\xD1\x92",		/* 0x90  U+0452  ђ */
	"\xE2\x80\x98",		/* 0x91  U+2018  ‘ */
	"\xE2\x80\x99",		/* 0x92  U+2019  ’ */
	"\xE2\x80\x9C",		/* 0x93  U+201C  “ */
	"\xE2\x80\x9D",		/* 0x94  U+201D  ” */
	"\xE2\x80\xA2",		/* 0x95  U+2022  • */
	"\xE2\x80\x93",		/* 0x96  U+2013  – */
	"\xE2\x80\x94",		/* 0x97  U+2014  — */
	"\xEF\xBF\xBD",		/* 0x98  U+FFFD  (replacement character) */
	"\xE2\x84\xA2",		/* 0x99  U+2122  ™ */
	"\xD1\x99",		/* 0x9A  U+0459  љ */
	"\xE2\x80\xBA",		/* 0x9B  U+203A  › */
	"\xD1\x9A",		/* 0x9C  U+045A  њ */
	"\xD1\x9C",		/* 0x9D  U+045C  ќ */
	"\xD1\x9B",		/* 0x9E  U+045B  ћ */
	"\xD1\x9F",		/* 0x9F  U+045F  џ */
	"\xC2\xA0",		/* 0xA0  U+00A0  (no-break space) */
	"\xD0\x8E",		/* 0xA1  U+040E  Ў */
	"\xD1\x9E",		/* 0xA2  U+045E  ў */
	"\xD0\x88",		/* 0xA3  U+0408  Ј */
	"\xC2\xA4",		/* 0xA4  U+00A4  ¤ */
	"\xD2\x90",		/* 0xA5  U+0490  Ґ */
	"\xC2\xA6",		/* 0xA6  U+00A6  ¦ */
	"\xC2\xA7",		/* 0xA7  U+00A7  § */
	"\xD0\x81",		/* 0xA8  U+0401  Ё */
	"\xC2\xA9",		/* 0xA9  U+00A9  © */
	"\xD0\x84",		/* 0xAA  U+0404  Є */
	"\xC2\xAB",		/* 0xAB  U+00AB  « */
	"\xC2\xAC",		/* 0xAC  U+00AC  ¬ */
	"\xC2\xAD",		/* 0xAD  U+00AD  (soft hyphen) */
	"\xC2\xAE",		/* 0xAE  U+00AE  ® */
	"\xD0\x87",		/* 0xAF  U+0407  Ї */
	"\xC2\xB0",		/* 0xB0  U+00B0  ° */
	"\xC2\xB1",		/* 0xB1  U+00B1  ± */
	"\xD0\x86",		/* 0xB2  U+0406  І */
	"\xD1\x96",		/* 0xB3  U+0456  і */
	"\xD2\x91",		/* 0xB4  U+0491  ґ */
	"\xC2\xB5",		/* 0xB5  U+00B5  µ */
	"\xC2\xB6",		/* 0xB6  U+00B6  ¶ */
	"\xC2\xB7",		/* 0xB7  U+00B7  · */
	"\xD1\x91",		/* 0xB8  U+0451  ё */
	"\xE2\x84\x96",		/* 0xB9  U+2116  № */
	"\xD1\x94",		/* 0xBA  U+0454  є */
	"\xC2\xBB",		/* 0xBB  U+00BB  » */
	"\xD1\x98",		/* 0xBC  U+0458  ј */
	"\xD0\x85",		/* 0xBD  U+0405  Ѕ */
	"\xD1\x95",		/* 0xBE  U+0455  ѕ */
	"\xD1\x97",		/* 0xBF  U+0457  ї */
	"\xD0\x90",		/* 0xC0  U+0410  А */
	"\xD0\x91",		/* 0xC1  U+0411  Б */
	"\xD0\x92",		/* 0xC2  U+0412  В */
	"\xD0\x93",		/* 0xC3  U+0413  Г */
	"\xD0\x94",		/* 0xC4  U+0414  Д */
	"\xD0\x95",		/* 0xC5  U+0415  Е */
	"\xD0\x96",		/* 0xC6  U+0416  Ж */
	"\xD0\x97",		/* 0xC7  U+0417  З */
	"\xD0\x98",		/* 0xC8  U+0418  И */
	"\xD0\x99",		/* 0xC9  U+0419  Й */
	"\xD0\x9A",		/* 0xCA  U+041A  К */
	"\xD0\x9B",		/* 0xCB  U+041B  Л */
	"\xD0\x9C",		/* 0xCC  U+041C  М */
	"\xD0\x9D",		/* 0xCD  U+041D  Н */
	"\xD0\x9E",		/* 0xCE  U+041E  О */
	"\xD0\x9F",		/* 0xCF  U+041F  П */
	"\xD0\xA0",		/* 0xD0  U+0420  Р */
	"\xD0\xA1",		/* 0xD1  U+0421  С */
	"\xD0\xA2",		/* 0xD2  U+0422  Т */
	"\xD0\xA3",		/* 0xD3  U+0423  У */
	"\xD0\xA4",		/* 0xD4  U+0424  Ф */
	"\xD0\xA5",		/* 0xD5  U+0425  Х */
	"\xD0\xA6",		/* 0xD6  U+0426  Ц */
	"\xD0\xA7",		/* 0xD7  U+0427  Ч */
	"\xD0\xA8",		/* 0xD8  U+0428  Ш */
	"\xD0\xA9",		/* 0xD9  U+0429  Щ */
	"\xD0\xAA",		/* 0xDA  U+042A  Ъ */
	"\xD0\xAB",		/* 0xDB  U+042B  Ы */
	"\xD0\xAC",		/* 0xDC  U+042C  Ь */
	"\xD0\xAD",		/* 0xDD  U+042D  Э */
	"\xD0\xAE",		/* 0xDE  U+042E  Ю */
	"\xD0\xAF",		/* 0xDF  U+042F  Я */
	"\xD0\xB0",		/* 0xE0  U+0430  а */
	"\xD0\xB1",		/* 0xE1  U+0431  б */
	"\xD0\xB2",		/* 0xE2  U+0432  в */
	"\xD0\xB3",		/* 0xE3  U+0433  г */
	"\xD0\xB4",		/* 0xE4  U+0434  д */
	"\xD0\xB5",		/* 0xE5  U+0435  е */
	"\xD0\xB6",		/* 0xE6  U+0436  ж */
	"\xD0\xB7",		/* 0xE7  U+0437  з */
	"\xD0\xB8",		/* 0xE8  U+0438  и */
	"\xD0\xB9",		/* 0xE9  U+0439  й */
	"\xD0\xBA",		/* 0xEA  U+043A  к */
	"\xD0\xBB",		/* 0xEB  U+043B  л */
	"\xD0\xBC",		/* 0xEC  U+043C  м */
	"\xD0\xBD",		/* 0xED  U+043D  н */
	"\xD0\xBE",		/* 0xEE  U+043E  о */
	"\xD0\xBF",		/* 0xEF  U+043F  п */
	"\xD1\x80",		/* 0xF0  U+0440  р */
	"\xD1\x81",		/* 0xF1  U+0441  с */
	"\xD1\x82",		/* 0xF2  U+0442  т */
	"\xD1\x83",		/* 0xF3  U+0443  у */
	"\xD1\x84",		/* 0xF4  U+0444  ф */
	"\xD1\x85",		/* 0xF5  U+0445  х */
	"\xD1\x86",		/* 0xF6  U+0446  ц */
	"\xD1\x87",		/* 0xF7  U+0447  ч */
	"\xD1\x88",		/* 0xF8  U+0448  ш */
	"\xD1\x89",		/* 0xF9  U+0449  щ */
	"\xD1\x8A",		/* 0xFA  U+044A  ъ */
	"\xD1\x8B",		/* 0xFB  U+044B  ы */
	"\xD1\x8C",		/* 0xFC  U+044C  ь */
	"\xD1\x8D",		/* 0xFD  U+044D  э */
	"\xD1\x8E",		/* 0xFE  U+044E  ю */
	"\xD1\x8F",		/* 0xFF  U+044F  я */
};
/*
 * One entry of the UTF-8 -> single-byte lookup tables (Utf1 and its children).
 *
 *   ch  - the UTF-8 byte this entry matches; 0 marks the end of a table.
 *   pch - in the tables of this file, either the address of the next-level
 *         table (cast to char*) or a small integer (the resulting single-byte
 *         code) cast to char*.
 *
 * The definition sits inside `#pragma pack(push, 1)`, and Utf8ToAscii walks
 * the tables with hard-coded offsets (+1 for pch, 5 bytes per entry), so the
 * field order and the packing must not change.
 */
struct _Utf {
	char  ch;
	char *pch;
};
typedef struct _Utf  tUtf;
typedef struct _Utf *pUtf;
/*
 * Lookup tables used by Utf8ToAscii to turn a UTF-8 sequence back into one
 * CP1251 byte.  Utf1 is keyed by the first byte of the sequence; its entries
 * point at a table keyed by the second byte, and so on.  An entry whose pch
 * is a small integer (not a table address) is a leaf: that integer is the
 * resulting byte.  Every table ends with a {0, 0} entry.
 * Table names encode the bytes already matched: Utf2D0 = second byte after
 * 0xD0, Utf3E280 = third byte after 0xE2 0x80.
 */

/* Second byte after 0xC2 (U+0080..U+00BF).  Key and value are equal in this
 * table, but the lookup code does not take advantage of that. */
tUtf Utf2C2[] = {
	{ 0xA0, (char *)0xA0 },		/* U+00A0 */
	{ 0xA4, (char *)0xA4 },		/* U+00A4 */
	{ 0xA6, (char *)0xA6 },		/* U+00A6 */
	{ 0xA7, (char *)0xA7 },		/* U+00A7 */
	{ 0xA9, (char *)0xA9 },		/* U+00A9 */
	{ 0xAB, (char *)0xAB },		/* U+00AB */
	{ 0xAC, (char *)0xAC },		/* U+00AC */
	{ 0xAD, (char *)0xAD },		/* U+00AD */
	{ 0xAE, (char *)0xAE },		/* U+00AE */
	{ 0xB0, (char *)0xB0 },		/* U+00B0 */
	{ 0xB1, (char *)0xB1 },		/* U+00B1 */
	{ 0xB5, (char *)0xB5 },		/* U+00B5 */
	{ 0xB6, (char *)0xB6 },		/* U+00B6 */
	{ 0xB7, (char *)0xB7 },		/* U+00B7 */
	{ 0xBB, (char *)0xBB },		/* U+00BB */
	{ 0, 0 }
};

/* Second byte after 0xD0 (U+0401..U+043F). */
tUtf Utf2D0[] = {
	{ 0x81, (char *)0xA8 },		/* U+0401 */
	{ 0x82, (char *)0x80 },		/* U+0402 */
	{ 0x83, (char *)0x81 },		/* U+0403 */
	{ 0x84, (char *)0xAA },		/* U+0404 */
	{ 0x85, (char *)0xBD },		/* U+0405 */
	{ 0x86, (char *)0xB2 },		/* U+0406 */
	{ 0x87, (char *)0xAF },		/* U+0407 */
	{ 0x88, (char *)0xA3 },		/* U+0408 */
	{ 0x89, (char *)0x8A },		/* U+0409 */
	{ 0x8A, (char *)0x8C },		/* U+040A */
	{ 0x8B, (char *)0x8E },		/* U+040B */
	{ 0x8C, (char *)0x8D },		/* U+040C */
	{ 0x8E, (char *)0xA1 },		/* U+040E */
	{ 0x8F, (char *)0x8F },		/* U+040F */
	{ 0x90, (char *)0xC0 },		/* U+0410 */
	{ 0x91, (char *)0xC1 },		/* U+0411 */
	{ 0x92, (char *)0xC2 },		/* U+0412 */
	{ 0x93, (char *)0xC3 },		/* U+0413 */
	{ 0x94, (char *)0xC4 },		/* U+0414 */
	{ 0x95, (char *)0xC5 },		/* U+0415 */
	{ 0x96, (char *)0xC6 },		/* U+0416 */
	{ 0x97, (char *)0xC7 },		/* U+0417 */
	{ 0x98, (char *)0xC8 },		/* U+0418 */
	{ 0x99, (char *)0xC9 },		/* U+0419 */
	{ 0x9A, (char *)0xCA },		/* U+041A */
	{ 0x9B, (char *)0xCB },		/* U+041B */
	{ 0x9C, (char *)0xCC },		/* U+041C */
	{ 0x9D, (char *)0xCD },		/* U+041D */
	{ 0x9E, (char *)0xCE },		/* U+041E */
	{ 0x9F, (char *)0xCF },		/* U+041F */
	{ 0xA0, (char *)0xD0 },		/* U+0420 */
	{ 0xA1, (char *)0xD1 },		/* U+0421 */
	{ 0xA2, (char *)0xD2 },		/* U+0422 */
	{ 0xA3, (char *)0xD3 },		/* U+0423 */
	{ 0xA4, (char *)0xD4 },		/* U+0424 */
	{ 0xA5, (char *)0xD5 },		/* U+0425 */
	{ 0xA6, (char *)0xD6 },		/* U+0426 */
	{ 0xA7, (char *)0xD7 },		/* U+0427 */
	{ 0xA8, (char *)0xD8 },		/* U+0428 */
	{ 0xA9, (char *)0xD9 },		/* U+0429 */
	{ 0xAA, (char *)0xDA },		/* U+042A */
	{ 0xAB, (char *)0xDB },		/* U+042B */
	{ 0xAC, (char *)0xDC },		/* U+042C */
	{ 0xAD, (char *)0xDD },		/* U+042D */
	{ 0xAE, (char *)0xDE },		/* U+042E */
	{ 0xAF, (char *)0xDF },		/* U+042F */
	{ 0xB0, (char *)0xE0 },		/* U+0430 */
	{ 0xB1, (char *)0xE1 },		/* U+0431 */
	{ 0xB2, (char *)0xE2 },		/* U+0432 */
	{ 0xB3, (char *)0xE3 },		/* U+0433 */
	{ 0xB4, (char *)0xE4 },		/* U+0434 */
	{ 0xB5, (char *)0xE5 },		/* U+0435 */
	{ 0xB6, (char *)0xE6 },		/* U+0436 */
	{ 0xB7, (char *)0xE7 },		/* U+0437 */
	{ 0xB8, (char *)0xE8 },		/* U+0438 */
	{ 0xB9, (char *)0xE9 },		/* U+0439 */
	{ 0xBA, (char *)0xEA },		/* U+043A */
	{ 0xBB, (char *)0xEB },		/* U+043B */
	{ 0xBC, (char *)0xEC },		/* U+043C */
	{ 0xBD, (char *)0xED },		/* U+043D */
	{ 0xBE, (char *)0xEE },		/* U+043E */
	{ 0xBF, (char *)0xEF },		/* U+043F */
	{ 0, 0 }
};

/* Second byte after 0xD1 (U+0440..U+045F). */
tUtf Utf2D1[] = {
	{ 0x80, (char *)0xF0 },		/* U+0440 */
	{ 0x81, (char *)0xF1 },		/* U+0441 */
	{ 0x82, (char *)0xF2 },		/* U+0442 */
	{ 0x83, (char *)0xF3 },		/* U+0443 */
	{ 0x84, (char *)0xF4 },		/* U+0444 */
	{ 0x85, (char *)0xF5 },		/* U+0445 */
	{ 0x86, (char *)0xF6 },		/* U+0446 */
	{ 0x87, (char *)0xF7 },		/* U+0447 */
	{ 0x88, (char *)0xF8 },		/* U+0448 */
	{ 0x89, (char *)0xF9 },		/* U+0449 */
	{ 0x8A, (char *)0xFA },		/* U+044A */
	{ 0x8B, (char *)0xFB },		/* U+044B */
	{ 0x8C, (char *)0xFC },		/* U+044C */
	{ 0x8D, (char *)0xFD },		/* U+044D */
	{ 0x8E, (char *)0xFE },		/* U+044E */
	{ 0x8F, (char *)0xFF },		/* U+044F */
	{ 0x91, (char *)0xB8 },		/* U+0451 */
	{ 0x92, (char *)0x90 },		/* U+0452 */
	{ 0x93, (char *)0x83 },		/* U+0453 */
	{ 0x94, (char *)0xBA },		/* U+0454 */
	{ 0x95, (char *)0xBE },		/* U+0455 */
	{ 0x96, (char *)0xB3 },		/* U+0456 */
	{ 0x97, (char *)0xBF },		/* U+0457 */
	{ 0x98, (char *)0xBC },		/* U+0458 */
	{ 0x99, (char *)0x9A },		/* U+0459 */
	{ 0x9A, (char *)0x9C },		/* U+045A */
	{ 0x9B, (char *)0x9E },		/* U+045B */
	{ 0x9C, (char *)0x9D },		/* U+045C */
	{ 0x9E, (char *)0xA2 },		/* U+045E */
	{ 0x9F, (char *)0x9F },		/* U+045F */
	{ 0, 0 }
};

/* Second byte after 0xD2. */
tUtf Utf2D2[] = {
	{ 0x90, (char *)0xA5 },		/* U+0490 */
	{ 0x91, (char *)0xB4 },		/* U+0491 */
	{ 0, 0 }
};

/* Third byte after 0xE2 0x80 (U+2013..U+203A). */
tUtf Utf3E280[] = {
	{ 0x93, (char *)0x96 },		/* U+2013 */
	{ 0x94, (char *)0x97 },		/* U+2014 */
	{ 0x98, (char *)0x91 },		/* U+2018 */
	{ 0x99, (char *)0x92 },		/* U+2019 */
	{ 0x9A, (char *)0x82 },		/* U+201A */
	{ 0x9C, (char *)0x93 },		/* U+201C */
	{ 0x9D, (char *)0x94 },		/* U+201D */
	{ 0x9E, (char *)0x84 },		/* U+201E */
	{ 0xA0, (char *)0x86 },		/* U+2020 */
	{ 0xA1, (char *)0x87 },		/* U+2021 */
	{ 0xA2, (char *)0x95 },		/* U+2022 */
	{ 0xA6, (char *)0x85 },		/* U+2026 */
	{ 0xB0, (char *)0x89 },		/* U+2030 */
	{ 0xB9, (char *)0x8B },		/* U+2039 */
	{ 0xBA, (char *)0x9B },		/* U+203A */
	{ 0, 0 }
};

/* Third byte after 0xE2 0x82. */
tUtf Utf3E282[] = {
	{ 0xAC, (char *)0x88 },		/* U+20AC */
	{ 0, 0 }
};

/* Third byte after 0xE2 0x84. */
tUtf Utf3E284[] = {
	{ 0x96, (char *)0xB9 },		/* U+2116 */
	{ 0xA2, (char *)0x99 },		/* U+2122 */
	{ 0, 0 }
};

/* Second byte after 0xE2: selects one of the third-byte tables. */
tUtf Utf2E2[] = {
	{ 0x80, (char *)Utf3E280 },
	{ 0x82, (char *)Utf3E282 },
	{ 0x84, (char *)Utf3E284 },
	{ 0, 0 }
};

/* Third byte after 0xEF 0xBF. */
tUtf Utf3EFBF[] = {
	{ 0xBD, (char *)0x98 },		/* U+FFFD */
	{ 0, 0 }
};

/* Second byte after 0xEF. */
tUtf Utf2EF[] = {
	{ 0xBF, (char *)Utf3EFBF },
	{ 0, 0 }
};

/* Root table: first byte of the UTF-8 sequence. */
tUtf Utf1[] = {
	{ 0xC2, (char *)Utf2C2 },
	{ 0xD0, (char *)Utf2D0 },
	{ 0xD1, (char *)Utf2D1 },
	{ 0xD2, (char *)Utf2D2 },
	{ 0xE2, (char *)Utf2E2 },
	{ 0xEF, (char *)Utf2EF },
	{ 0, 0 }
};
#pragma pack(pop)
void __declspec(naked) AsciiToUtf8(char *ac){
__asm{
push ebp
mov ebp, esp
mov edx, [ac] //lea esi, [ac]
push edx
push edx
call lenc
//1 add esp, 4 //подчищаем за функцией: __cdecl lenc
pop esi
sub esp, eax //резервируем место в стеке
//1 sub esp, 4 //доп. резерв
mov edi, esp
push edi
push esi
mov ecx, eax
shr ecx, 2
inc ecx
rep movsd
pop edi
pop esi
mov eax, 0
cld
atu1: lodsb
test al, 80h
jz atue
sub al, 80h
mov ah, 0
shl eax, 2
lea edx, [chartoutf + eax]
xchg edx, esi
atu2: lodsb
or al, al
jnz atu3
mov esi, edx
jmp atu1
atu3: stosb
jmp atu2
atue: stosb
or al, al
jnz atu1
mov esp, ebp
pop ebp
retn
}
}
void __declspec(naked) Utf8ToAscii(char *ac){
// В работе данной функции используется тот факт что адресное пространство приложений в Windows начинается с адреса 0x00400000
__asm{
push ebp
mov ebp, esp
mov esi, [ac] // lea esi, [ac]
mov edi, esi
cld
uta1: lodsb
test al, 128
jz utae
mov ah, al
and ah, 224
cmp ah, 192
jne uta2
mov ecx, 1
jmp uta4
uta2: mov ah, al
and ah, 240
cmp ah, 224
jne uta3
mov ecx, 2
jmp uta4
uta3: mov ah, al
and ah, 248
cmp ah, 240
jne utae // not utf8, but > 80h
mov ecx, 3
uta4: lea ebx, Utf1 // lea ebx, Utf1
uta5: cmp al, [ebx]
jne uta7
mov ebx, [ebx + 1]
cmp ebx, 10000h // it is fact described above )
ja uta6
mov al, bl
jmp utae
uta6: lodsb
dec ecx
jns uta5
mov ecx, 0
jmp uta8
uta7: add ebx, 5
cmp byte ptr [ebx], 0
jnz uta5
uta8: mov al, '-' //default char for unknown utf8 code
add esi, ecx
utae: stosb
or al, al
jnz uta1
mov esp, ebp
pop ebp
retn
}
} |