/* gbk2uni.c -- forked from CTeX-org/ctex-kit */
/* $Id: gbk2uni.c,v 1.3 2005/03/21 02:44:41 zlb Exp $ */
/* */
/* gbk2uni.c : Transform GBK characters in .out file to unicode codes. */
/* the initial code is from out2uni in dvipdfmx project of KTUG */
/* authors: cxterm and Linbo Zhang in 2003 */
/* reach them at http://www.ctex.org */
/* enhancer: hooklee (Shujun Li) in 2003 */
/* reach hooklee at http://www.hooklee.com or www.chinatex.org */
/* ===================================================== */
/* ********************hyperref书签文件编码规则************************** */
/* 每个书签以如下形式存放 :\BOOKMARK [1][-]{section.0.1}{书签正文}{} */
/* 非unicode模式下使用hyperref宏包,bookmark中的部分特殊字符以\ooo的形式插入 */
/* ' ':\040, '#':\043, '$':\044, '%':\045, '&':\046, '\':\134, '^':\136, '_':\137, '{':\173, '}':\175, '~':\176 */
/* 比较特殊的是'('和')',是以'\('和'\)'的形式插入的,而不是\ooo形式 */
/* \S:\247 */
/* 所有其他字符和汉字均不作任何处理,在bookmark中保留 */
/* 已经知道,这种保留会造成部分汉字在bookmark中无法显示 */
/* 当使用\CJKchar{"0081}{"040}方式直接以GBK代码的方法插入汉字,bookmark中会生成如下的书签代码: */
/* "0081"040,显然,pdflatex忽略了\CJKchar命令本身和前后的{}把参数当做普通文本做了转换 */
/* '^^xx^^yy'形式的CJK汉字在.out中有两种可能的出现方式:'^^xx^^yy'和'^^xxL' */
/* ===================================================== */
/* unicode模式下使用hyperref宏包,bookmark特殊字符均编码为\ooo\ooo或者\000x或者\000x\80y形式的unicode代码 */
/* 书签内容均以\376\377开头作为前导标示符 */
/* 经过实验,相应的bookmarkunicode代码插入规则如下: */
/* *****A类:编码为\ooo\ooo的特殊字符部分***** */
/* ' '(空格):\000\040,使用\textvisiblespace也得到同样的书签 */
/* '#'(\#):\000\043, '$'(\$):\000\044, '%'(\%):\000\045, '&'(\&):\000\046 */
/* '(':\000\050; ')':\000\051 */
/* '\'(\textbackslash):\000\134; */
/* '^'(\textasciicircum):\000\136; '_'(\_):\000\137 */
/* '{'(\{):\000\173; '}'(\}):\000\175 */
/* '~'(\textasciitilde):\000\176; */
/* *****B类:编码为\000x的普通字符部分,其中x表示字符本身***** */
/* abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 */
/* |:',./!?;"-+=[]`*@(直接用@即可,无需\@)<(或\textless)>(或\textgreater) */
/* *****C类:单个汉字***** */
/* 假设其高位码为H,低位码为L,则一般的GB汉字其插入形式为:\000H\80L */
/* 但是上述情况存在例外,当L为普通拉丁字符时,将会以\000HL的形式插入 */
/* 如果任何汉字出现在一个低位为拉丁字符的GBK汉字之后,第二个汉字会以\80H\000L的形式出现 */
/* 只有\80HL是不可能出现的汉字代码 */
/* 当书签中包含多个汉字的时候,重复按照上述规则插入,汉字中间的其他字符按照正常规则插入 */
/* ===================================================== */
/* @注意:当汉字低位字节为字符'}{~\_^'时,tex文档编译会出现错误,强行编译可能出现不可预测的行为 */
/* @插入out文件的内容变得很混乱,一般书签正文会在低位'}'出现之后终止,gbk2uni只尽可能地消除这种影响 */
/* @这可能使得部分GBK汉字在书签中消失或者显示为其他字符 */
/* @使用张林波老师随CCT新版发行的cctconv程序可以解决这个问题 */
/* @cctconv把汉字低位字节为'\', '{', '}', '^', '_', '~'的汉字低位字节分别改为'012345'以方便处理 */
/* @或者使用-f开关转换可以将所有高位为1的字符转换为^^xx的形式,这在一些老的不支持扩展字符的tex系统中有用 */
/* @gbk2uni处理这样的汉字假设cctconv已经运行(cctconv与CJK兼容,无需cct.sty即可得到正确的dvi文件) */
/* @这样的汉字经cctconv处理后,插入out文件的对应内容在unicode模式下有两种可能: */
/* @sprintf("\\000%d\\%03o", H, L)和sprintf("%d\\%03o", H, L),这里L已经是被转换回来的'}{~\_^' */
/* 现在还不清楚是否也有sprintf("\\80%d\\%03o", H, L)形式出现(根据'\80HL'不出现推测这种形式可能也不出现) */
/* ===================================================== */
/* *****D类:\CJKchar{"00ab}{"0cd}形式的CJK汉字***** */
/* \000"\0000\0000\000a\000b\000"\0000\000c\000d */
/* 显然,unicode模式的hyperref是如下工作的: */
/* 第一步先生成非unicode模式的out文件,接着对其中的扩展字符做了一个后处理,但是这个处理对汉字不正确 */
/* *****E类:'^^xx^^yy'形式的CJK汉字***** */
/* 在unicode模式下,.out文件的内容一般为'\000^^xx\80^^yy'或者'\000^^xx\80L' */
/* 估计其他CJK汉字的样式也可能出现:'\80^^xx\000^^yy'、'\80^^xx\000L'、'\000^^xx^^yy'、'\000^^xx\ooo' */
/* 以上情况说明,在处理过程中,我们可以像TeX那样将每一个'^^xx'码字当成普通的ASCII字符来处理即可兼容'^^xx'代码 */
/* ===================================================== */
/* 如果\CJKchar{}{}中的前后两个参数位数不同,单从.out文件无法判断第二个参数何时结束 */
/* 因此,gbk2uni要求在tex文档中统一采用三位十六进制法\CJKchar{"0xx}{"0xx}表示前后两个参数 */
/* 考虑到在实际中\CJKchar用的比较少,这个约定应该不会算大的限制。 */
/* ===================================================== */
#define VERSION "0.22"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#if defined(WIN32) && !defined(__MINGW32__)
# include <io.h>
# define PATH_MAX _MAX_PATH+1
#else
# include <unistd.h>
# include <limits.h>
#endif
#include "gbk2uni.h"
#ifndef WIN32
# define _fileno(f) f
/*
 * Return the size in bytes of the open stream f, or 0 when it cannot be
 * determined.  The current file position is preserved.
 *
 * ftell() reports failure with -1L; storing that in a size_t would turn it
 * into an enormous "length", so every step is checked and any failure is
 * folded into the 0 result, which the caller already treats as an error.
 */
static size_t _filelength(FILE *f)
{
    long pos;
    long length;

    pos = ftell(f);
    if (pos < 0)
        return 0;
    if (fseek(f, 0L, SEEK_END) != 0)
        return 0;
    length = ftell(f);
    /* always try to restore the position, even if the size query failed */
    if (fseek(f, pos, SEEK_SET) != 0 || length < 0)
        return 0;
    return (size_t)length;
}
#endif
#if !defined(WIN32) && !defined(GO32)
/*
 * Case-insensitive comparison of at most n characters of s0 and s1.
 * Returns 0 when the compared prefixes are equal ignoring case, otherwise
 * the difference of the first pair of upper-cased characters that differ.
 *
 * Each character is converted to unsigned char before toupper(), because
 * passing a negative plain char to a <ctype.h> function is undefined.
 * Comparison also covers the terminating NUL, so a string that ends before
 * n characters no longer compares equal to a longer one.
 */
static int strnicmp (const char *s0, const char *s1, int n)
{
    while (n-- > 0) {
        int c0 = toupper((unsigned char)*s0++);
        int c1 = toupper((unsigned char)*s1++);

        if (c0 != c1)
            return c0 - c1;
        if (c0 == '\0')
            return 0;   /* both strings ended together */
    }
    return 0;
}
#endif
#define BYTE unsigned char
#define DWORD unsigned int
/* Nonzero when the low 8 bits of h lie in 0x81..0xfe, the range this program */
/* accepts for the high (first) byte of a GBK character. */
/* The argument is parenthesized so that expressions such as a|b are masked as a whole. */
#define GBK_HIGH(h) ((0x81<=((h)&0xff)) && (((h)&0xff)<=0xfe))
/* Nonzero when the low 8 bits of l lie in 0x40..0xfe, the range this program */
/* accepts for the low (second) byte of a GBK character. */
#define GBK_LOW(l) ((0x40<=((l)&0xff)) && (((l)&0xff)<=0xfe))
/* Run-time options; main() sets them from the command line. */
int bLock = 0;/* '-u' or '-l': main() writes a '\let\WriteBookmarks\relax' line at the top of the new .out file */
int bUnlockOnly = 0;/* '-unlock': main() copies the lines through without parsing them */
int bCJKchar = 1;/* enable \CJKchar support by default; main() clears it for '-nocjk' and sets it for '-cjk' */
/* int bIgnoreCJK7 = 0; */
int bParsingErrors = 1;/* when nonzero, parsing problems are reported on stderr; cleared by '-npe' */
/* int bVerbose = 0; */
int bSilent = 0;/* '-s': main() skips the version banner and the final "finished" message */
FILE *Fout;/* output stream: the temporary file that receives the converted bookmarks */
FILE *Fin;/* input stream: the .out file being read */
void version(void)
{
printf("gbk2uni, version "VERSION", initially implemented by cxterm and ZLB in Jan. 2003\n");
printf("\t enhanced by hooklee in Mar. 2003.\n");
printf("\t please visit www.ctex.org and www.chinatex.org for more information.\n");
}
/* print the version banner followed by the command-line help on stdout */
void usage(void)
{
    /* one entry per help line; the disabled '-i' option is intentionally not listed */
    static const char *const help[] = {
        "Usage : gbk2uni [options] filename[.out] [options]\n",
        "Options:\n",
        "\t-u(-l)\t lock .out file to avoid overwritten in the next (pdf)latex run\n",
        "\t\t (.out file will be unlocked if no '-u' and '-l' options)\n",
        "\t-unlock\t unlock .out file without parsing .out file\n",
        "\t-s\t run gbk2uni silently (but errors remain)\n",
        "\t-cjk\t parse \\CJKchar{\"0xx}{\"0xx} command (default)\n",
        "\t-nocjk\t disable parsing \\CJKchar{\"0xx}{\"0xx} command\n",
        "\t-npe\t disable display of all parsing errors\n"
    };
    size_t k;

    version();
    for (k = 0; k < sizeof help / sizeof help[0]; k++)
        fputs(help[k], stdout);
}
/* write the low 16 bits of u to Fout as two backslash-octal escapes: */
/* first bits 8..15, then bits 0..7, each as a backslash plus three octal digits */
void putucode(unsigned int u)
{
    const unsigned int high = (u >> 8) & 0xff;
    const unsigned int low = u & 0xff;

    fprintf(Fout,"%c%03o%c%03o",'\\',high,'\\',low);
}
/*
 * Convert the GBK character with high byte h and low byte l through the
 * gbk2uni table and write the result to Fout as two backslash-octal escapes
 * (upper 8 bits first, then lower 8 bits).
 *
 * A byte pair outside the ranges accepted by GBK_HIGH/GBK_LOW is discarded;
 * when bParsingErrors is set it is reported on stderr together with nLine,
 * the line number of the .out file being processed.  Both bytes are printed
 * as two hex digits each so that the pair can be read back unambiguously.
 */
void putGBKcode(BYTE h, BYTE l,int nLine)
{
    unsigned int u;

    if (!GBK_HIGH(h) || !GBK_LOW(l)) {
        /* if current GBK character is not valid, it will be discarded */
        if (bParsingErrors)
            fprintf (stderr, "An invalid GBK character is found:\n\tLine %d: ... 0x%02x%02x\n",
                     nLine, (unsigned int)h, (unsigned int)l);
        return;
    }
    /* the table is indexed with 192 slots per high byte, starting at low byte 0x40 */
    u = gbk2uni[(h-0x81)*192 + (l-0x40)];
    fprintf(Fout,"\\%03o\\%03o",(u >> 8) & 0xff,u & 0xff);
}
/* is the character c an octal digit '0'...'7'?  returns 1 if so, 0 otherwise */
int is8digit(char c)
{
    if (c < '0' || c > '7')
        return 0;
    return 1;
}
/* is the character c a hexadecimal digit '0'...'9','a'...'f','A'...'F'? */
/* returns 1 if so, 0 otherwise */
/* c is converted to unsigned char first: the input may contain bytes with the */
/* high bit set, and a negative value passed to a <ctype.h> function is undefined */
int is16digit(char c)
{
    return isxdigit((unsigned char)c) ? 1 : 0;
}
/* convert the first three characters of str, read as octal digits, to a number */
/* the characters are not validated; the result is reduced to unsigned char */
unsigned char otoi(char *str)
{
    int value = 0;
    int k;

    for (k = 0; k < 3; k++)
        value = value * 8 + (str[k] - '0');
    return value;
}
/* value of one lower-cased hexadecimal digit; the caller guarantees validity */
/* (anything that is not a decimal digit is treated as a letter counted from 'a') */
static unsigned char xtoi_nibble(unsigned char c)
{
    if (isdigit(c))
        return (unsigned char)(c - '0');
    return (unsigned char)(c - 'a' + 10);
}

/* convert the first two characters of str, read as hexadecimal digits of */
/* either case, to a number; the characters are not validated here */
/* each character is converted to unsigned char before tolower(), because a */
/* negative plain char passed to a <ctype.h> function is undefined */
unsigned char xtoi(char *str)
{
    unsigned char h = xtoi_nibble((unsigned char)tolower((unsigned char)str[0]));
    unsigned char l = xtoi_nibble((unsigned char)tolower((unsigned char)str[1]));

    return (unsigned char)(16*h + l);
}
/* helper for the unicode-mode form of \CJKchar{}{}, in which every character */
/* is preceded by '\000': advance *str past the next '\000' prefix */
/* returns  1: a backslash followed by three '0' was skipped, *str is just behind it */
/* returns  0: '}' or the end of the string was reached first, *str stays on it */
/* returns -1: a backslash was found but fewer than three '0' followed it */
/*             (reported on stderr with line number nLine when bParsingErrors is set) */
int getCJKchar(char **str,int nLine)
{
    char *cursor = *str;
    int zeros = 0;

    /* look for the next backslash, but never run past '}' or the terminator */
    while (*cursor != '\\' && *cursor != '\0' && *cursor != '}')
        cursor++;
    if (*cursor != '\\') {
        *str = cursor;
        return 0;
    }
    cursor++;
    while (zeros < 3 && *cursor == '0') {
        cursor++;
        zeros++;
    }
    *str = cursor;
    if (zeros == 3)
        return 1;
    if (bParsingErrors)
        fprintf (stderr, "An incomplete \\CJKchar{}{} command is found:\n\tLine %d: ... \"%s\"\n", nLine, cursor-zeros-1);
    return -1;
}
/* translate a '^^xx'-format TeX character to an ascii character */
/* if not a '^^xx'-format TeX character, return itself */
int translateChar(char **str,int nLine)
{
BYTE a;
if ( **str != '^') {
a = **str;
if ( **str != '}' && **str != '\0' ) (*str)++;
return a;/* if not '^^xx' directly return the current character */
}
while(**str == '^') (*str)++;/* skip all '^' characters */
if ( is16digit(**str) && is16digit(*(*str+1)) ) {
a = xtoi(*str);
(*str) = *str + 2;
return a;
}
else {
if (bParsingErrors)
fprintf (stderr, "An incomplete '^^xx' TeX character is found:\n\tLine %d: ... \"%s\"\n", nLine, (*str)-2);
return -1;/* -1L = 0xffffffff */
}
}
/* parse the text of one bookmark and write it to Fout as backslash-octal escapes */
/* str:      bookmark text; main() passes the position just behind the opening '{' */
/*           (and behind a leading '\376\377' marker when there is one) */
/* bUnicode: nonzero when the text is already in the unicode form of hyperref, */
/*           i.e. made of '\ooo\ooo', '\000x', '\000H\80L', '\000HL' or '\80H\000L' groups */
/* nLine:    line number of the .out file, used only in error messages */
/* returns a pointer to the first character that was not consumed: the closing '}' */
/* of the bookmark text or the string terminator */
/* latin characters are written as '\000\ooo', GBK characters go through putGBKcode(), */
/* and complete '\ooo\ooo' pairs met in unicode mode are copied unchanged */
/* problems are reported on stderr when bParsingErrors is set and the offending input is skipped */
char *doparse(char *str,BYTE bUnicode,int nLine)
{
BYTE lh,h,l;/* h and l: high and low byte of the GBK character being assembled; lh: value of the leading '\ooo' group */
int i, rtn;
char strCode[4];/* up to three digits plus the terminator */
/* skip to the next valid character... needed or not? */
while(1) {
/* blanks are skipped in unicode mode only; in non-unicode mode a blank falls through to the default case below */
if(bUnicode)
while(*str==' ' || *str=='\t' || *str=='\n' || *str=='\r') str++;
else
while(*str=='\t' || *str=='\n' || *str=='\r') str++;
if ( *str == '}' || *str == '\0') return str;/* end */
switch(*str) {
case '\\':
/* original unicode codes generated by pdflatex, including '\(' and '\)' */
/* note: '\oo' and '\par' may occur in wrongly-compiled tex document */
while(*str == '\\') str++;/* occasionally double '\' may occur in a wrong .out file */
/* processing '\ooo' in non-unicode mode */
if(!bUnicode) {
/* processing '\(' and '\)' in non-unicode */
if ( *str == '(' || *str == ')') {
fprintf(Fout,"\\000\\%03o", *str++);
break;
}
/* in non-unicode mode, '\ooo' is possible for special latin character, such as '\S' */
if (isdigit(*str)) {
fprintf(Fout,"\\000\\");/* add '\000' prefix to current special unicode character */
i=0;
/* copy at most three digits; the digits are not padded when fewer than three are present */
while(1) {
fputc(*str++,Fout);i++;
if(!isdigit(*str) || i >=3) break;
}/* to avoid less than three digital characters after '\' */
/* fwrite(Fount,1,3,str);str+=3; */
if ( i < 3 && bParsingErrors)
fprintf (stderr, "An incomplete special character is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
}
break;
}
/* processing '\ooo\ooo' or '\000x' or '\000H\80L' or '\000HL' or '\80H\000L' in unicode mode */
/* here please note that either 'H' or 'L' or both two can be '^^xx'-format */
if (isdigit(*str)) {
/* in unicode mode, '\ooo\ooo' and '\000x' and '\000H\80L' are all possible for different characters */
/* possibly, '\oo' should be taken into consideration to avoid possible collapse of gbk2uni */
strCode[0]=*str++;
for (i=1; i<3; i++) {
if(isdigit(*str)) strCode[i]=*str++;
else break;
}
strCode[i]='\0';
if (i == 1) {
if (bParsingErrors)
fprintf (stderr, "An incomplete special unicode code is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
break;
}
/* the digits are read as a DECIMAL number: only the values 0 ('\000') and 80 ('\80') are tested below */
/* NOTE(review): the result is stored in a BYTE, so three-digit groups above 255 wrap (e.g. "336" gives 80) - confirm this cannot matter */
lh = atoi(strCode);/* get the high byte of current unicode character */
if(i ==3 && *str == '\\') {/* '\ooo\ooo': normal unicode character */
fprintf(Fout, "\\%s\\", strCode);/* directly output leading '\ooo\' */
str++;
for (i=0; i<3; i++) {
if(isdigit(*str)) strCode[i]=*str++;/* collect the second 'ooo' */
else break;
}
strCode[i]='\0';
if (i < 3) {
if (bParsingErrors)
fprintf (stderr, "An incomplete unicode code is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
}
else fprintf(Fout, "%s", strCode);/* directly output the left 'ooo' */
break;
}
if (lh == 0 && *str == '\"' && bCJKchar) {/* \CJKchar{"0xx}{"0xx} command in unicode mode */
/* every character of the command is preceded by '\000'; getCJKchar() skips one such prefix per call */
/* a return of 0 means '}' or the end of the string was reached, -1 means a malformed prefix */
rtn = getCJKchar (&str, nLine);
if(rtn == 0) return str;
if(rtn == -1) break;/* skip the first '\0000' */
rtn = getCJKchar (&str, nLine);
if(rtn == 0) return str;
if(rtn == -1) break;
strCode[0] = *str++;/* get the first digit of high byte */
rtn = getCJKchar (&str, nLine);
if(rtn == 0) return str;
if(rtn == -1) break;
strCode[1] = *str++;/* get the second digit of high byte */
h = xtoi (strCode);/* get high byte */
rtn = getCJKchar (&str, nLine);
if(rtn == 0) return str;
if(rtn == -1) break;
if (*str != '\"') {/* is the third unicode code '"'? */
if (bParsingErrors)
fprintf (stderr, "An incomplete \\CJKchar{}{} command is found:\n\tLine %d: ... \"%s\"\n", nLine, str-4);
break;
}
rtn = getCJKchar (&str, nLine);
if(rtn == 0) return str;
if(rtn == -1) break;/* skip the second '\0000' */
rtn = getCJKchar (&str, nLine);
if(rtn == 0) return str;
if(rtn == -1) break;
strCode[0] = *str++;/* get the first digit of low byte */
rtn = getCJKchar (&str, nLine);
if(rtn == 0) return str;
if(rtn == -1) break;
strCode[1] = *str++;/* get the second digit of low byte */
l = xtoi (strCode);/* get low byte */
putGBKcode(h, l, nLine);/* put unicode code via GBK2UNICODE transformation */
break;
}
/* '\000x' or '\000H\80L' or '\80H\000L' or '\000H\ooo' */
rtn = translateChar(&str,nLine);
if (rtn == -1) break;/* break when encountering errors */
else h = (BYTE) rtn;
if (lh == 0 && h != 0 && h != '}' && h < 0x80) {/* '\000x' format remains */
/* translate '\000x' to '\000\ooo' to get more robust result */
fprintf(Fout, "\\000\\%03o", h);
break;
}
if (lh == 0 && h > 0x80) {/* '\000HL' or '\000H\80L' or '\000H\ooo' */
/* h = (BYTE) *str++;*/ /*set high GBK byte */
if ( *str != '\\') {/* '\000HL' */
rtn = translateChar(&str,nLine);
if (rtn == -1) break;/* break when encountering errors */
else l = (BYTE) rtn;
/* l = *str++; */
putGBKcode(h, l, nLine);/* put unicode code via GBK2UNICODE transformation */
}
else {/* '\000H\80L' or '\000H\ooo' */
if (*(str+1) == '8' && *(str+2) == '0') {/* is '80L' after '\'? */
str += 3;
rtn = translateChar(&str,nLine);
if (rtn == -1) break;/* break when encountering errors */
else l = (BYTE) rtn;
/* l = *str++;*/ /*set low GBK byte */
putGBKcode(h, l, nLine);/* put unicode code via GBK2UNICODE transformation */
}
else if ( is8digit(*(str+1)) && is8digit(*(str+2)) && is8digit(*(str+3)) ) {
l = otoi(str+1);
putGBKcode(h, l, nLine);/* put a GBK code */
#if 0
if (l == '{' || l == '}' || l == '\\' || l == '^' || l == '_' || l == '~' || l == 0x80)
putGBKcode(h, l, nLine);/* put a GBK code */
else if (bParsingErrors)
fprintf (stderr, "An invalid GBK character (in cctconv format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-2);
#endif
str += 4;
}
else if (bParsingErrors)
fprintf (stderr, "An incomplete GBK character is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
}
break;
}
if (lh == 80 && h > 0x80) {/* '\80H\000L' */
/* h = (BYTE) *str++;*/ /*set high GBK byte */
if (*str == '\\' && *(str+1) == '0' && *(str+2) == '0' && *(str+3) == '0') {/* is '\000L' after '\80H'? */
str += 4;
rtn = translateChar(&str,nLine);
if (rtn == -1) break;/* break when encountering errors */
else l = (BYTE) rtn;
/* l = *str++;*/ /*set low GBK byte */
putGBKcode(h, l, nLine);/* put unicode code via GBK2UNICODE transformation */
}
else if (bParsingErrors)
fprintf (stderr, "An incomplete GBK character is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-1);
break;
}
/* NOTE(review): this tests h, the character read after the digits, against 80; the comment below */
/* speaks of '\80x', which would be lh == 80 - confirm which of the two was intended */
if (h == 80 && *str > 0) {/* is '\80x' possible? */
if(*str != '}' && *str != '\0') str++;/* goto the next code */
}
break;
}
/* remove '\par' from .out file */
if(*str == 'p' && *(str+1) == 'a' && *(str+2) == 'r') str+=3;
break;
#if 0
case '^':/* GBK characters with CJK format '^^xx^^yy' */
while(*str=='^') str++;/* skip all '^' characters */
if (is16digit(*str) && is16digit(*(str+1))) {
h = xtoi(str); str += 2;
while(*str=='^') str++;/* skip all '^' characters */
if (is16digit(*str) && is16digit(*(str+1))) {
l = xtoi(str); str += 2;
if (!bIgnoreCJK7) putGBKcode(h, l, nLine);/* put a GBK code if not ignoring */
}
else if (*str < 0)
else if (bParsingErrors)
fprintf (stderr, "An incomplete GBK character (in CJK format) is found:\n\tLine %d: ... \"^^%s\"\n", nLine, str-2);
}
else if (bParsingErrors)
fprintf (stderr, "An incomplete GBK character (in CJK format) is found:\n\tLine %d: ... \"^^%s\"\n", nLine, str);
break;
#endif
case '\"':/* \CJKchar{"0xx}{"0xx} command in non-unicode mode? */
if (!bCJKchar) fprintf(Fout, "\\000\\%03o", *str++);/* normal '"' character in non-unicode mode */
else {/* \CJKchar{"0xx}{"0xx} command in non-unicode mode */
while(*str=='\"') str++;/* skip all '"' characters */
if (*str == '0') str++;/* skip the first '0' */
if (is16digit(*str) && is16digit(*(str+1))) {
h = xtoi(str); str += 2;
while(*str=='\"') str++;/* skip all '"' characters */
if (*str == '0') str++;/* skip the second '0' */
if (is16digit(*str) && is16digit(*(str+1))) {
l = xtoi(str); str += 2;
putGBKcode(h, l, nLine);/* put a GBK code */
}
else if (bParsingErrors)
fprintf (stderr, "An incomplete GBK character (in \\CJKchar{}{} format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str-2);
}
else if (bParsingErrors)
fprintf (stderr, "An incomplete GBK character (in \\CJKchar{}{} format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str);
}
break;
default:/* normal characters in non-unicode mode or cctconv GBK characters in both mode */
/* or '^^xx^^yy'/'^^xxL' TeX characters */
rtn = translateChar(&str,nLine);
if (rtn == -1) break;/* break when encountering errors */
else h = (BYTE) rtn;
if (h != 0 && h != '}' && h < 0x80) fprintf(Fout, "\\000\\%03o", h);/* normal latin character */
else {/* GBK character */
/* h = *str++;*/ /*GBK high byte */
if (*str == '\\') {/* characters generated by cctconv */
str++;
for(i = 0; i < 3; i++) {
if(is8digit(*str)) strCode[i] = *str++;
else break;
}
strCode[i] = '\0';
if ( i != 3) {
if (bParsingErrors)
fprintf (stderr, "An incomplete GBK character (in cctconv format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-2);
break;
}
l = otoi(strCode);/* get low byte from '\ooo' */
putGBKcode(h, l, nLine);/* put a GBK code */
#if 0
if (l == '{' || l == '}' || l == '\\' || l == '^' || l == '_' || l == '~')
putGBKcode(h, l, nLine);/* put a GBK code */
else if (bParsingErrors)
fprintf (stderr, "An invalid GBK character (in cctconv format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str-i-2);
#endif
break;
}
/* low byte of a normal CJK character or '^^yy' */
rtn = translateChar(&str,nLine);
if (rtn == -1) break;/* break when encountering errors */
else l = (BYTE) rtn;
/* a low byte '0'...'7' is a placeholder and is mapped back to the byte it stands for; */
/* the digits themselves are below 0x40 and so cannot be real GBK low bytes (see GBK_LOW) */
switch(l) {
case '}':
case '\0':
if (bParsingErrors)
fprintf (stderr, "An incomplete GBK character (in '^^xx^^yy' format) is found:\n\tLine %d: ... \"%s\"\n", nLine, str);
return str;/* SHOULD exit when reading '\0' or '}' */
case '0': l = '\\'; break;/* reserved for future CCT */
case '1': l = '{'; break;/* reserved for future CCT */
case '2': l = '}'; break;/* reserved for future CCT */
case '3': l = '^'; break;/* reserved for future CCT */
case '4': l = '_'; break;/* reserved for future CCT */
case '5': l = '~'; break;/* reserved for future CCT */
case '6': l = 0x80; break;/* reserved for future CCT */
case '7': l = '|'; break;/* reserved for future CCT */
default: ;/* normal GBK character or '^^yy' */
}
putGBKcode(h, l, nLine);/* put a GBK code */
/* str++; */
}
}
}
}
/*
 * Program entry point.
 *
 * Command line: gbk2uni [options] filename[.out] [options]
 *   -u, -l    lock the result: a '\let\WriteBookmarks\relax' line is written first
 *   -unlock   only drop that locking line, copy everything else unchanged
 *   -s        silent: no banner and no final message
 *   -cjk      parse \CJKchar{"0xx}{"0xx} (default);  -nocjk disables it
 *   -npe      do not report parsing errors
 * Any other argument is taken as the file name; the last one wins and ".out"
 * is appended when the name does not already end with it.
 *
 * The converted text is written to "<name>.tmp"; afterwards the original is
 * renamed to "<name>.bak" and the temporary file takes its place.
 *
 * Returns 0 on success and 1 when no file name was given, a name is too long
 * for the path buffers, or the result could not be written or moved into
 * place.  Failures to open or size the input exit with status 1, a failed
 * allocation with status 2.
 */
int main(int argc, char* argv[])
{
    char inname[PATH_MAX]="";
    char outname[PATH_MAX]="";
    char bakname[PATH_MAX]="";
    char *p;
    unsigned int nLength;
    /* largest file length for which nLength + 1 still fits the int taken by fgets() */
    const unsigned int nMaxLength = ((unsigned int)-1 >> 1) - 1;
    char *b_in,*b2_in,*b3_in;
    BYTE bUnicode=0;
    int nLine, i;
    int bWriteFailed, bBackedUp;

    for(i=1; i < argc; i++) {
#if defined(WIN32) || defined(GO32)
        strlwr(argv[i]);
#endif
        if (!strcmp(argv[i], "-u") || !strcmp(argv[i], "-l")) {
            bLock = 1; bUnlockOnly = 0;
        }
        /* else if (!strcmp(argv[i], "-i")) bIgnoreCJK7 = 1; */
        /* else if (!strcmp(argv[i], "-v")) bVerbose = 1; */
        else if (!strcmp(argv[i], "-s")) bSilent = 1;
        else if (!strcmp(argv[i], "-cjk")) bCJKchar = 1;
        else if (!strcmp(argv[i], "-nocjk")) bCJKchar = 0;
        else if (!strcmp(argv[i], "-unlock")) {
            bUnlockOnly = 1; bLock = 0;
        }
        else if (!strcmp(argv[i], "-npe")) bParsingErrors = 0;
        else {
            /* the name comes from the user: never copy more than the buffer holds */
            if (strlen(argv[i]) >= sizeof(inname)) {
                fprintf(stderr, "File name is too long: %s\n", argv[i]);
                return 1;
            }
            strcpy(inname, argv[i]);
        }
    }
    if(inname[0] == '\0') {
        usage();
        return 1;
    }
    if (!bSilent) version();/* display version and developer information */

    /* after this test p == NULL means that ".out" has to be appended */
    p = strrchr(inname, '.');
#ifdef WIN32
    if((p == NULL) || stricmp(p, ".out")) p = NULL;
#else
    if((p == NULL) || strcmp(p, ".out")) p = NULL;
#endif
    /* room is needed for the optional ".out" plus ".tmp"/".bak" and the terminator */
    if (strlen(inname) + (p == NULL ? 4 : 0) + 4 >= sizeof(inname)) {
        fprintf(stderr, "File name is too long: %s\n", inname);
        return 1;
    }
    if (p == NULL) strcat(inname, ".out");
    strcpy(outname, inname);
    strcat(outname, ".tmp");
    strcpy(bakname, inname);
    strcat(bakname, ".bak");

    Fin = fopen(inname, "r");
    if(!Fin) {
        fprintf(stderr, "Cannot open %s to read!\n", inname);
        exit(1);
    }
    nLength = _filelength (_fileno(Fin));
    /* 0 means empty or unknown; values above the limit include a failed query turned into an unsigned number */
    if (nLength == 0 || nLength > nMaxLength) {
        fprintf(stderr, "Cannot get the file size of %s!\n", inname);
        fclose (Fin); exit(1);
    }
    Fout = fopen(outname, "wt");
    if(!Fout) {
        fprintf(stderr, "Cannot open %s to write!\n", outname);
        fclose (Fin); exit(1);
    }
    /* one line is at most the whole file; fgets() needs one more byte for the terminator */
    b_in = (char *) malloc(nLength + 1);
    if(!b_in) {
        fprintf(stderr, "Memory allocation error!\n");
        fclose (Fin); fclose (Fout); exit (2);
    }
    if(bLock) {
        fprintf(Fout,"\\let\\WriteBookmarks\\relax\n");
    }
    nLine = 0;
    while (fgets(b_in, (int)(nLength + 1), Fin) != NULL)
    {
        nLine++;
        /* if '\let\WriteBookmarks\relax' is found, skip the current line */
        if (strstr(b_in,"\\let\\WriteBookmarks\\relax\n")) continue;
        if (bUnlockOnly) {/* only unlock .out file when '-unlock' option is set */
            fputs(b_in, Fout);
            continue;
        }
        b2_in = b_in;
        while (*b2_in==' ' || *b2_in=='\n' || *b2_in=='\r' || *b2_in=='\t') b2_in++;
        if (*b2_in == '\0') continue;/* blank lines are dropped */
        if (strnicmp(b2_in, "\\BOOKMARK",9)) {/* skip invalid line in .out file */
            if (bParsingErrors)
                fprintf(stderr, "Invalid line is found:\n\tLine %d: \"%s\"\n", nLine, b_in);
            continue;
        }
        /* find the position of the second parentheses */
        /* '{' and '}' are displayed as '\173' and '\175' in .out file */
        /* so there is no nested parentheses */
        while(*b2_in!='}' && *b2_in!='\0' && *b2_in!='\n' && *b2_in!='\r' && *b2_in!='\t') b2_in++;
        while(*b2_in!='{' && *b2_in!='\0' && *b2_in!='\n' && *b2_in!='\r' && *b2_in!='\t') b2_in++;
        /* the scans also stop at a line break or a tab, so test for the brace itself */
        if (*b2_in != '{') {
            if (bParsingErrors)
                fprintf(stderr, "Warning: No bookmark content is found:\n\tLine %d: \"%s\"\n", nLine, b_in);
            continue;
        }
        b2_in ++;
        fwrite ( b_in, 1, b2_in-b_in, Fout);/* copy the left part of the new line */
        /* Set unicode flag and skip the leading characters if encountering '\376\377' */
        /* (the flag is not reset: once one line carries the marker, all later lines are parsed as unicode) */
        if( !strncmp(b2_in, "\\376\\377", 8) ) {
            b2_in += 8; bUnicode = 1;
        }
        fputs("\\376\\377",Fout);/* set .out file to unicode format in non-unicode mode */
        b3_in=doparse(b2_in, bUnicode, nLine);/* parse the middle part and write the new unicode codes */
        fputs(b3_in,Fout);/* copy the right part of the new line */
    }
    bWriteFailed = ferror(Fout);
    free (b_in);
    fclose(Fin);
    if (fclose(Fout) != 0) bWriteFailed = 1;
    if (bWriteFailed) {
        /* never replace the original with an incomplete result */
        fprintf(stderr, "Cannot write %s!\n", outname);
        remove(outname);
        return 1;
    }
    remove(bakname);/* fails harmlessly when there is no previous backup */
    bBackedUp = (rename(inname,bakname) == 0);
    if (!bBackedUp)
        fprintf(stderr, "Warning: cannot rename %s to %s!\n", inname, bakname);
    if (rename(outname,inname) != 0) {
        fprintf(stderr, "Cannot rename %s to %s!\n", outname, inname);
        if (bBackedUp) rename(bakname,inname);/* put the original back */
        return 1;
    }
    if (!bSilent) fprintf(stdout, "gbk2uni %s is finished!\n",inname);
    /* getchar(); */
    return 0;
}