demo_add_n_llm.py
"""
AnyText: Multilingual Visual Text Generation And Editing
Paper: https://arxiv.org/abs/2311.03054
Code: https://github.com/tyxsspa/AnyText
Copyright (c) Alibaba, Inc. and its affiliates.
"""
import datetime
import enum
import os
import torch
from modelscope.pipelines import pipeline
import cv2
import gradio as gr
import random
import numpy as np
import re
import json
import argparse
from PIL import ImageFont
from gradio.components import Component
from pytorch_lightning import seed_everything
import einops
import time
from PIL import Image
from mllm import GPT4_maskGen, get_pos_n_glyph
from util import (check_channels, resize_image, save_images, arr2tensor,
                  check_overlap_polygon, get_lines, count_lines, sep_mask, draw_pos)
from t3_dataset import draw_glyph, draw_glyph2, draw_glyph3, draw_glyph4, get_caption_pos
BBOX_MAX_NUM = 8
PLACE_HOLDER = "*"
save_memory = False
load_model = True
max_chars = 20
max_lines = 20
phrase_list = [
", content and position of the texts are ",
", textual material depicted in the image are ",
", texts that says ",
", captions shown in the snapshot are ",
", with the words of ",
", that reads ",
", the written materials on the picture: ",
", these texts are written on it: ",
", captions are ",
", content of the text in the graphic is ",
]
def parse_args():
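    """Parse command-line options: precision, translator, font/model/config paths, and output folder."""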
parser = argparse.ArgumentParser()
parser.add_argument(
"--use_fp32",
action="store_true",
default=False,
help="Whether or not to use fp32 during inference.",
)
parser.add_argument(
"--no_translator",
action="store_true",
default=False,
help="Whether or not to use the CH->EN translator, which enable input Chinese prompt and cause ~4GB VRAM.",
)
parser.add_argument(
"--font_path",
type=str,
default="font/Arial_Unicode.ttf",
help="path of a font file",
)
parser.add_argument(
"--model_path",
type=str,
default="/data/hugging_face_models/cv_anytext_text_generation_editing/anytext_v1.1.ckpt",
help="load a specified anytext checkpoint",
)
parser.add_argument(
"--config_yaml",
type=str,
default="./models_yaml/anytext_sd15.yaml",
)
parser.add_argument(
"--save_path",
type=str,
default="Result/masa",
help="load a specified anytext checkpoint",
)
args = parser.parse_args()
return args
args = parse_args()
img_save_folder = args.save_path
infer_params = {
"model": "damo/cv_anytext_text_generation_editing",
"model_revision": "v1.1.3",
"use_fp16": not args.use_fp32,
"use_translator": not args.no_translator,
"font_path": args.font_path,
}
if args.model_path:
infer_params["model_path"] = args.model_path
if load_model:
# inference = pipeline("my-anytext-task", **infer_params)
from cldm.model import create_model, load_state_dict
from cldm.ddim_hacked import DDIMSampler
from cldm_mod.ddim_add import myDDIMSampler
model = create_model(args.config_yaml).cuda()
# print(model.cond_stage_model.transformer.text_model.embeddings)
    model.load_state_dict(load_state_dict(args.model_path, location="cuda"), strict=True)
# print(model.control_model.input_blocks.state_dict().keys())
# print(model.control_model.input_blocks[5][1].transformer_blocks[0])
font = ImageFont.truetype("./font/Arial_Unicode.ttf", size=60)
def generate_rectangles(w, h, n, max_trys=200):
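    """Randomly place n non-overlapping (possibly rotated) rectangles on a (h, w) mask.

    Each rectangle's size is sampled relative to the canvas, rotated with ~30%
    probability, and rejected if it crosses the border or overlaps an earlier
    rectangle. Returns a uint8 mask with the accepted rectangles filled white;
    raises gr.Error if n rectangles cannot be placed within max_trys attempts.
    """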
img = np.zeros((h, w, 1), dtype=np.uint8)
rectangles = []
attempts = 0
n_pass = 0
low_edge = int(max(w, h) * 0.3 if n <= 3 else max(w, h) * 0.2) # ~150, ~100
while attempts < max_trys:
rect_w = min(np.random.randint(max((w * 0.5) // n, low_edge), w), int(w * 0.8))
ratio = np.random.uniform(4, 10)
rect_h = max(low_edge, int(rect_w / ratio))
rect_h = min(rect_h, int(h * 0.8))
# gen rotate angle
rotation_angle = 0
rand_value = np.random.rand()
if rand_value < 0.7:
pass
elif rand_value < 0.8:
rotation_angle = np.random.randint(0, 40)
elif rand_value < 0.9:
rotation_angle = np.random.randint(140, 180)
else:
rotation_angle = np.random.randint(85, 95)
# rand position
x = np.random.randint(0, w - rect_w)
y = np.random.randint(0, h - rect_h)
# get vertex
rect_pts = cv2.boxPoints(
((rect_w / 2, rect_h / 2), (rect_w, rect_h), rotation_angle)
)
rect_pts = np.int32(rect_pts)
# move
rect_pts += (x, y)
        # check border
if (
np.any(rect_pts < 0)
or np.any(rect_pts[:, 0] >= w)
or np.any(rect_pts[:, 1] >= h)
):
attempts += 1
continue
# check overlap
if any(check_overlap_polygon(rect_pts, rp) for rp in rectangles):
attempts += 1
continue
n_pass += 1
cv2.fillPoly(img, [rect_pts], 255)
rectangles.append(rect_pts)
if n_pass == n:
break
print("attempts:", attempts)
if len(rectangles) != n:
        raise gr.Error(
            f"Failed to auto-generate positions after {attempts} attempts, please try again!"
        )
return img
def draw_rects(width, height, rects):
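    """Rasterize normalized (x, y, w, h) boxes into a filled uint8 position mask."""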
img = np.zeros((height, width, 1), dtype=np.uint8)
for rect in rects:
x1 = int(rect[0] * width)
y1 = int(rect[1] * height)
w = int(rect[2] * width)
h = int(rect[3] * height)
x2 = x1 + w
y2 = y1 + h
cv2.rectangle(img, (x1, y1), (x2, y2), 255, -1)
return img
def inference(input_data, **params):
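    """Run AnyText sampling for one prompt.

    Builds per-line glyph and position maps (either from a GPT-generated layout
    when params["use_gpt"] is set, or from the user-drawn mask), encodes the
    masked background into latent space, then samples with myDDIMSampler under
    classifier-free guidance. Returns the decoded images plus the elapsed time
    in milliseconds as the last list element.
    """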
    ddim_sampler = myDDIMSampler(
        model,
        start_step=params["start_op_step"],
        end_step=params["end_op_step"],
        max_op_step=params["OPTIMIZE_STEPS"],
        loss_alpha=params["alpha"],
        loss_beta=params["beta"],
        add_theta=params["theta"],
    )
(
H,
W,
) = (params["image_height"], params["image_width"])
prompt = input_data["prompt"]
seed = input_data["seed"]
if seed == -1:
seed = random.randint(0, 65535)
seed_everything(seed)
if save_memory:
model.low_vram_shift(is_diffusing=False)
info = {}
info["glyphs"] = []
info["gly_line"] = []
info["positions"] = []
info["angles"] = []
info["Mats_r"] = []
info["lambda"]=params["lambda"]
info["step_size"]=params["step_size"]
positions = []
now = datetime.datetime.now()
date_str = now.strftime("%Y-%m-%d")
time_str = now.strftime("%H_%M_%S")
save_with_para = os.path.join(img_save_folder,f"""alpha_{params["alpha"]}_beta_{params["beta"]}_theta_{params["theta"]}_{int(params["start_op_step"])}->{int(params["end_op_step"])}_{int(params["OPTIMIZE_STEPS"])}""",date_str)
if not os.path.exists(os.path.join(save_with_para, "glyph")):
os.makedirs(os.path.join(save_with_para, "glyph"))
# %--------------------------------------%
# LLM separating masks
if params["use_gpt"]:
texts = []
info["texts"] = []
OPENAI_API_KEY = "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"
_, prompt = get_lines(prompt)
pos_json = GPT4_maskGen(input_data["prompt"],OPENAI_API_KEY,api_model="gpt-4o")
for idx,js in enumerate(pos_json):
tx = js["Texts"]
texts+=[tx]
info["texts"] += [js["Texts"]]
cx,cy,w,h,angle = js["OBB"]
cx = cx * W
cy = cy * H
h = h * H
w = w * W
pos, glyphs = get_pos_n_glyph(cx,cy,w,h,angle,tx)
positions+=[pos]
info["glyphs"] += [arr2tensor(glyphs, params["image_count"])]
info["positions"] += [arr2tensor(positions[idx], params["image_count"])] # shape [batch_size ,1 , 512, 512]
if idx==0:
replace_str = '" * "'
else:
replace_str += ', " * "'
prompt = prompt.replace('" * "', replace_str)
# %--------------------------------------%
else:
pos_imgs = input_data["draw_pos"]
polygons = sep_mask(pos_imgs, params["sort_priority"]) # list([numpoints, 1, 2])
info["polygons"] = polygons
texts, prompt = get_lines(prompt)
info["texts"] = texts
for idx,text in enumerate(texts):
glyphs, angle, center = draw_glyph3(ImageFont.truetype(params["Font_path"],size=60), text, polygons[idx], scale=2)
cv2.imwrite(os.path.join(save_with_para, "glyph",f"{time_str}_{idx}.png"), glyphs*255)
info["glyphs"] += [arr2tensor(glyphs, params["image_count"])]
for idx, poly in enumerate(polygons):
positions += [draw_pos(poly, 1.0)]
info["positions"] += [arr2tensor(positions[idx], params["image_count"])] # shape [batch_size ,1 , 512, 512]
for idx, text in enumerate(texts):
gly_line = draw_glyph(font, text)
info["gly_line"] += [arr2tensor(gly_line, params["image_count"])]
# padding
n_lines = min(len(texts), max_lines)
info["n_lines"] = [n_lines] * params["image_count"]
# get masked_x
hint = np.sum(positions, axis=0).clip(0, 1)
# print(hint.shape)
ref_img = np.ones((H, W, 3)) * 127.5
masked_img = ((ref_img.astype(np.float32) / 127.5) - 1.0) * (1 - hint)
masked_img = np.transpose(masked_img, (2, 0, 1))
masked_img = torch.from_numpy(masked_img.copy()).float().cuda()
encoder_posterior = model.encode_first_stage(masked_img[None, ...])
masked_x = model.get_first_stage_encoding(encoder_posterior).detach()
info["masked_x"] = torch.cat(
[masked_x for _ in range(params["image_count"])], dim=0
)
info["times"] = params["times"]
info["times1"] = params["times1"]
hint = arr2tensor(hint, params["image_count"])
batch_size = params["image_count"]
# prompt = prompt.replace(" * ","")
print("Prompt:",prompt)
cond = model.get_learned_conditioning(
dict(
c_concat=[hint],
c_crossattn=[[prompt + "," + params["a_prompt"]] * batch_size],
text_info=info,
)
)
un_cond = model.get_learned_conditioning(
dict(
c_concat=[hint],
c_crossattn=[[params["n_prompt"]] * batch_size],
text_info=info,
)
)
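    # SD-1.5 latent space: 4 channels at 1/8 of the image resolution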
shape = (4, H // 8, W // 8)
if save_memory:
model.low_vram_shift(is_diffusing=True)
model.control_scales = [params["strength"]] * 13
tic = time.time()
    samples, samples_other, intermediates = ddim_sampler.sample(
params["ddim_steps"],
batch_size,
shape,
cond,
log_every_t=1,
verbose=False,
eta=params["eta"],
unconditional_guidance_scale=params["cfg_scale"],
unconditional_conditioning=un_cond,
)
# for idx, x_inter in enumerate(intermediates["x_inter"]):
# # print(idx)
# x_inter = model.decode_first_stage(x_inter)
# x_inter = (
# (einops.rearrange(x_inter, "b c h w -> b h w c") * 127.5 + 127.5)
# .cpu()
# .numpy()
# .clip(0, 255)
# .astype(np.uint8)
# )
# if not os.path.exists(os.path.join(img_save_folder,"inter")):
# os.makedirs(os.path.join(img_save_folder, "inter"))
# cv2.imwrite(os.path.join(img_save_folder, "inter",f"{idx}.png"), x_inter[0])
cost = (time.time() - tic) * 1000.0
if save_memory:
model.low_vram_shift(is_diffusing=False)
x_samples = model.decode_first_stage(samples)
x_samples = (
(einops.rearrange(x_samples, "b c h w -> b h w c") * 127.5 + 127.5)
.cpu()
.numpy()
.clip(0, 255)
.astype(np.uint8)
)
# print(x_samples.shape)
now = datetime.datetime.now()
date_str = now.strftime("%Y-%m-%d")
time_str = now.strftime("%H_%M_%S")
# for idx, x_inter in enumerate(intermediates["x_inter"]):
# # print(idx)
# # x_inter = model.decode_first_stage(x_inter)
# # x_inter = (einops.rearrange(x_inter, "b c h w -> b h w c") * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
# decode_x0 = model.decode_first_stage(intermediates["pred_x0_other"][idx])
# decode_x0 = torch.clamp(decode_x0, -1, 1)
# decode_x0 = (decode_x0 + 1.0) / 2.0 * 255 # -1,1 -> 0,255; n, c,h,w
# decode_x0 = einops.rearrange(decode_x0, "b c h w -> b h w c").cpu().numpy().clip(0, 255).astype(np.uint8)
# # if not os.path.exists(os.path.join(img_save_folder,"inter")):
# # os.makedirs(os.path.join(img_save_folder, "inter"))
# # cv2.imwrite(os.path.join(img_save_folder, "inter",f"{idx}.png"), x_inter[0])
# save_with_para = os.path.join(img_save_folder,f"""alpha_{params["alpha"]}_beta_{params["beta"]}_theta_{params["theta"]}_{int(params["start_op_step"])}->{int(params["end_op_step"])}_{int(params["OPTIMIZE_STEPS"])}""",date_str)
# if not os.path.exists(os.path.join(save_with_para, "inter_other")):
# os.makedirs(os.path.join(save_with_para, "inter_other"))
# cv2.imwrite(os.path.join(save_with_para, "inter_other",f"{time_str}_{idx}.png"), decode_x0[0])
results = [x_samples[i] for i in range(batch_size)]
    x_samples_other = model.decode_first_stage(samples_other)
    x_samples_other = (
        (einops.rearrange(x_samples_other, "b c h w -> b h w c") * 127.5 + 127.5)
        .cpu()
        .numpy()
        .clip(0, 255)
        .astype(np.uint8)
    )
# results += [x_samples_other[i] for i in range(batch_size)]
# results = x_samples
results += [cost]
return results
def process(
mode,
prompt,
pos_radio,
sort_radio,
revise_pos,
base_model_path,
lora_path_ratio,
show_debug,
draw_img,
rect_img,
ref_img,
ori_img,
img_count,
ddim_steps,
w,
h,
strength,
cfg_scale,
seed,
eta,
a_prompt,
n_prompt,
times,
times1,
step_size,
lamb,
op_steps,
start_op_step,
end_op_step,
loss_alpha,
loss_beta,
add_theta,
use_gpt,
font_path,
*rect_list,
):
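    """Gradio click handler shared by generation and editing.

    Builds the position image for the selected mode ("gen": manual draw,
    dragged rectangles, or random boxes; "edit": the painted mask over the
    reference image), packs the UI widget values into a params dict, and calls
    inference(). Returns the result images and a debug-time markdown.
    """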
torch.cuda.empty_cache()
n_lines = count_lines(prompt)
# Text Generation
if mode == "gen":
# create pos_imgs
if pos_radio == "Manual-draw(手绘)":
if draw_img is not None:
pos_imgs = 255 - draw_img["image"]
if "mask" in draw_img:
pos_imgs = pos_imgs.astype(np.float32) + draw_img["mask"][
..., 0:3
].astype(np.float32)
pos_imgs = pos_imgs.clip(0, 255).astype(np.uint8)
else:
pos_imgs = np.zeros((w, h, 1))
elif pos_radio == "Manual-rect(拖框)":
rect_check = rect_list[:BBOX_MAX_NUM]
rect_xywh = rect_list[BBOX_MAX_NUM:]
checked_rects = []
for idx, c in enumerate(rect_check):
if c:
_xywh = rect_xywh[4 * idx : 4 * (idx + 1)]
checked_rects += [_xywh]
pos_imgs = draw_rects(w, h, checked_rects)
elif pos_radio == "Auto-rand(随机)":
pos_imgs = generate_rectangles(w, h, n_lines, max_trys=500)
# Text Editing
elif mode == "edit":
revise_pos = False # disable pos revise in edit mode
if ref_img is None or ori_img is None:
raise gr.Error("No reference image, please upload one for edit!")
edit_image = ori_img.clip(1, 255) # for mask reason
edit_image = check_channels(edit_image)
edit_image = resize_image(edit_image, max_length=768)
h, w = edit_image.shape[:2]
if (
isinstance(ref_img, dict)
and "mask" in ref_img
and ref_img["mask"].mean() > 0
):
pos_imgs = 255 - edit_image
edit_mask = cv2.resize(ref_img["mask"][..., 0:3], (w, h))
pos_imgs = pos_imgs.astype(np.float32) + edit_mask.astype(np.float32)
pos_imgs = pos_imgs.clip(0, 255).astype(np.uint8)
else:
if isinstance(ref_img, dict) and "image" in ref_img:
ref_img = ref_img["image"]
pos_imgs = 255 - ref_img # example input ref_img is used as pos
if not os.path.exists(os.path.join(img_save_folder,"masks")):
os.makedirs(os.path.join(img_save_folder, "masks"))
mask_save = os.path.join(img_save_folder,"masks",f"{prompt}.png")
while os.path.exists(mask_save):
mask_save = mask_save.replace(".png", "_1.png")
cv2.imwrite(mask_save, 255 - pos_imgs[..., ::-1])
# center = (pos_imgs.shape[0] // 2, pos_imgs.shape[1] // 2)
# rotMat = cv2.getRotationMatrix2D(center, angle, 1.0)
# rot_pos_imgs = cv2.warpAffine(pos_imgs,rotMat,pos_imgs.shape[1:-1])
# print(times)
params = {
"mode": mode,
"sort_priority": sort_radio,
"show_debug": show_debug,
"revise_pos": revise_pos,
"image_count": img_count,
"ddim_steps": ddim_steps,
"image_width": w,
"image_height": h,
"strength": strength,
"cfg_scale": cfg_scale,
"eta": eta,
"a_prompt": a_prompt,
"n_prompt": n_prompt,
"base_model_path": base_model_path,
"lora_path_ratio": lora_path_ratio,
"times":times,
"times1":times1,
"step_size":step_size,
"lambda":lamb,
"start_op_step":int(start_op_step),
"end_op_step":int(end_op_step),
"OPTIMIZE_STEPS":int(op_steps),
"alpha":loss_alpha,
"beta":loss_beta,
"theta":add_theta,
"use_gpt": use_gpt,
"Font_path": font_path,
}
input_data = {
"prompt": prompt,
"seed": seed,
"draw_pos": pos_imgs, # numpy (w, h, 1)
"ori_image": ori_img,
}
results = inference(input_data, **params)
    time_cost = results.pop()  # renamed from "time" to avoid shadowing the time module
# if rtn_code >= 0:
save_with_para = os.path.join(img_save_folder,f"alpha_{loss_alpha}_beta_{loss_beta}_theta_{add_theta}_{int(start_op_step)}->{int(end_op_step)}_{int(op_steps)}")
save_images(results, save_with_para)
print(f"Done, result images are saved in: {save_with_para}")
# if rtn_warning:
# gr.Warning(rtn_warning)
# else:
# raise gr.Error(rtn_warning)
    return results, gr.Markdown(f"Time: {time_cost}", visible=show_debug)
def create_canvas(w=512, h=512, c=3, line=5):
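    """Create a light-gray drawing canvas with grid lines and a red center marker."""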
image = np.full((h, w, c), 200, dtype=np.uint8)
for i in range(h):
if i % (w // line) == 0:
image[i, :, :] = 150
for j in range(w):
if j % (w // line) == 0:
image[:, j, :] = 150
image[h // 2 - 8 : h // 2 + 8, w // 2 - 8 : w // 2 + 8, :] = [200, 0, 0]
return image
def resize_w(w, img1, img2):
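    """Resize both canvases to a new width, unwrapping Gradio's sketch dict if needed."""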
if isinstance(img2, dict):
img2 = img2["image"]
return [cv2.resize(img1, (w, img1.shape[0])), cv2.resize(img2, (w, img2.shape[0]))]
def resize_h(h, img1, img2):
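    """Resize both canvases to a new height, unwrapping Gradio's sketch dict if needed."""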
if isinstance(img2, dict):
img2 = img2["image"]
return [cv2.resize(img1, (img1.shape[1], h)), cv2.resize(img2, (img2.shape[1], h))]
def mask_upload(mask_path):
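    """Read an uploaded mask image from its temporary file path."""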
mask_pic = cv2.imread(mask_path.name)
return mask_pic
is_t2i = "true"
block = gr.Blocks(css="style.css", theme=gr.themes.Soft()).queue()
with open("javascript/bboxHint.js", "r") as file:
value = file.read()
escaped_value = json.dumps(value)
with block:
block.load(
fn=None,
_js=f"""() => {{
const script = document.createElement("script");
const text = document.createTextNode({escaped_value});
script.appendChild(text);
document.head.appendChild(script);
}}""",
)
gr.HTML(
'<div style="text-align: center; margin: 20px auto;"> \
<img id="banner" src="file/example_images/banner.png" alt="anytext"> <br> \
[<a href="https://arxiv.org/abs/2311.03054" style="color:blue; font-size:18px;">arXiv</a>] \
[<a href="https://github.com/tyxsspa/AnyText" style="color:blue; font-size:18px;">Code</a>] \
[<a href="https://modelscope.cn/models/damo/cv_anytext_text_generation_editing/summary" style="color:blue; font-size:18px;">ModelScope</a>]\
[<a href="https://huggingface.co/spaces/modelscope/AnyText" style="color:blue; font-size:18px;">HuggingFace</a>]\
version: 1.1.3 </div>'
)
with gr.Row(variant="compact"):
with gr.Column() as left_part:
pass
with gr.Column():
result_gallery = gr.Gallery(
label="Result(结果)",
show_label=True,
preview=True,
columns=2,
allow_preview=True,
height=600,
)
result_info = gr.Markdown("", visible=False)
        Font_path = gr.Textbox(label="Font Path(字体路径)", value="./font/Arial_Unicode.ttf")
with gr.Row():
gr.Markdown("")
use_gpt = gr.Checkbox(label="Use GPT(是否使用gpt生成mask)", value=True)
run_gen = gr.Button(value="Run(运行)!", scale=0.3, elem_classes="run")
gr.Markdown("")
with gr.Row():
gr.Markdown("")
run_edit = gr.Button(value="Run Edit(运行编辑)!", scale=0.3, elem_classes="run")
gr.Markdown("")
with left_part:
with gr.Accordion(
"🕹Instructions(说明)",
open=False,
):
with gr.Tabs():
with gr.Tab("English"):
gr.Markdown(
'<span style="color:#3B5998;font-size:20px">Run Examples</span>'
)
gr.Markdown(
'<span style="color:#575757;font-size:16px">AnyText has two modes: Text Generation and Text Editing, and we provides a variety of examples. Select one, click on [Run!] button to run.</span>'
)
gr.Markdown(
'<span style="color:gray;font-size:12px">Please note, before running examples, ensure the manual draw area is empty, otherwise may get wrong results. Additionally, different examples use \
different parameters (such as resolution, seed, etc.). When generate your own, please pay attention to the parameter changes, or refresh the page to restore the default parameters.</span>'
)
gr.Markdown(
'<span style="color:#3B5998;font-size:20px">Text Generation</span>'
)
gr.Markdown(
'<span style="color:#575757;font-size:16px">Enter the textual description (in Chinese or English) of the image you want to generate in [Prompt]. Each text line that needs to be generated should be \
enclosed in double quotes. Then, manually draw the specified position for each text line to generate the image.</span>\
<span style="color:red;font-size:16px">The drawing of text positions is crucial to the quality of the resulting image</span>, \
<span style="color:#575757;font-size:16px">please do not draw too casually or too small. The number of positions should match the number of text lines, and the size of each position should be matched \
as closely as possible to the length or width of the corresponding text line. If [Manual-draw] is inconvenient, you can try dragging rectangles [Manual-rect] or random positions [Auto-rand].</span>'
)
gr.Markdown(
'<span style="color:gray;font-size:12px">When generating multiple lines, each position is matched with the text line according to a certain rule. The [Sort Position] option is used to \
determine whether to prioritize sorting from top to bottom or from left to right. You can open the [Show Debug] option in the parameter settings to observe the text position and glyph image \
in the result. You can also select the [Revise Position] which uses the bounding box of the rendered text as the revised position. However, it is occasionally found that the creativity of the \
generated text is slightly lower using this method.</span>'
)
gr.Markdown(
'<span style="color:#3B5998;font-size:20px">Text Editing</span>'
)
gr.Markdown(
'<span style="color:#575757;font-size:16px">Please upload an image in [Ref] as a reference image, then adjust the brush size, and mark the area(s) to be edited. Input the textual description and \
the new text to be modified in [Prompt], then generate the image.</span>'
)
gr.Markdown(
'<span style="color:gray;font-size:12px">The reference image can be of any resolution, but it will be internally processed with a limit that the longer side cannot exceed 768 pixels, and the \
width and height will both be scaled to multiples of 64.</span>'
)
with gr.Tab("简体中文"):
gr.Markdown(
'<span style="color:#3B5998;font-size:20px">运行示例</span>'
)
gr.Markdown(
'<span style="color:#575757;font-size:16px">AnyText有两种运行模式:文字生成和文字编辑,每种模式下提供了丰富的示例,选择一个,点击[Run!]即可。</span>'
)
gr.Markdown(
'<span style="color:gray;font-size:12px">请注意,运行示例前确保手绘位置区域是空的,防止影响示例结果,另外不同示例使用不同的参数(如分辨率,种子数等),如果要自行生成时,请留意参数变化,或刷新页面恢复到默认参数。</span>'
)
gr.Markdown(
'<span style="color:#3B5998;font-size:20px">文字生成</span>'
)
gr.Markdown(
'<span style="color:#575757;font-size:16px">在Prompt中输入描述提示词(支持中英文),需要生成的每一行文字用双引号包裹,然后依次手绘指定每行文字的位置,生成图片。</span>\
<span style="color:red;font-size:16px">文字位置的绘制对成图质量很关键</span>, \
<span style="color:#575757;font-size:16px">请不要画的太随意或太小,位置的数量要与文字行数量一致,每个位置的尺寸要与对应的文字行的长短或宽高尽量匹配。如果手绘(Manual-draw)不方便,\
可以尝试拖框矩形(Manual-rect)或随机生成(Auto-rand)。</span>'
)
gr.Markdown(
'<span style="color:gray;font-size:12px">多行生成时,每个位置按照一定规则排序后与文字行做对应,Sort Position选项用于确定排序时优先从上到下还是从左到右。\
可以在参数设置中打开Show Debug选项,在结果图像中观察文字位置和字形图。也可以勾选Revise Position选项,这样会用渲染文字的外接矩形作为修正后的位置,不过偶尔发现这样生成的文字创造性略低。</span>'
)
gr.Markdown(
'<span style="color:#3B5998;font-size:20px">文字编辑</span>'
)
gr.Markdown(
'<span style="color:#575757;font-size:16px">请上传一张待编辑的图片作为参考图(Ref),然后调整笔触大小后,在参考图上涂抹要编辑的位置,在Prompt中输入描述提示词和要修改的文字内容,生成图片。</span>'
)
gr.Markdown(
'<span style="color:gray;font-size:12px">参考图可以为任意分辨率,但内部处理时会限制长边不能超过768,并且宽高都被缩放为64的整数倍。</span>'
)
with gr.Accordion("🛠Parameters(参数)", open=False):
with gr.Row(variant="compact"):
img_count = gr.Slider(
label="Image Count(图片数)",
minimum=1,
maximum=12,
value=1,
step=1,
)
ddim_steps = gr.Slider(
label="Steps(步数)", minimum=1, maximum=100, value=20, step=1
)
with gr.Row(variant="compact"):
image_width = gr.Slider(
label="Image Width(宽度)",
minimum=256,
maximum=768,
value=512,
step=64,
)
image_height = gr.Slider(
label="Image Height(高度)",
minimum=256,
maximum=768,
value=512,
step=64,
)
with gr.Row(variant="compact"):
strength = gr.Slider(
label="Strength(控制力度)",
minimum=0.0,
maximum=2.0,
value=1.0,
step=0.01,
)
cfg_scale = gr.Slider(
label="CFG-Scale(CFG强度)",
minimum=0.1,
maximum=30.0,
value=9.0,
step=0.1,
)
with gr.Row(variant="compact"):
seed = gr.Slider(
label="Seed(种子数)",
minimum=-1,
maximum=99999999,
step=1,
randomize=False,
value=-1,
)
eta = gr.Number(label="eta (DDIM)", value=0.0)
with gr.Row(variant="compact"):
show_debug = gr.Checkbox(label="Show Debug(调试信息)", value=False)
gr.Markdown(
'<span style="color:silver;font-size:12px">whether show glyph image and debug information in the result(是否在结果中显示glyph图以及调试信息)</span>'
)
with gr.Row(variant="compact"):
Clip_times = gr.Number(label="Clip Times(Clip Embedding增强倍数)", value=1.0)
                Clip_times1 = gr.Number(
                    label="Clip Times 1(内容外的Clip Embedding增强倍数)", value=1.0
                )
a_prompt = gr.Textbox(
label="Added Prompt(附加提示词)",
value="best quality, extremely detailed,4k, HD, supper legible text, clear text edges, clear strokes, neat writing, no watermarks",
)
n_prompt = gr.Textbox(
label="Negative Prompt(负向提示词)",
value="low-res, bad anatomy, extra digit, fewer digits, cropped, worst quality, low quality, watermark, unreadable text, messy words, distorted text, disorganized writing, advertising picture",
)
base_model_path = gr.Textbox(label="Base Model Path(基模地址)")
lora_path_ratio = gr.Textbox(label="LoRA Path and Ratio(lora地址和比例)")
with gr.Accordion("🛠Optimization Parameters(优化参数)", open=False):
with gr.Row(variant="compact"):
step_size = gr.Number(label="Step Size(优化步长)", value=0)
                lamd = gr.Number(label="Lambda(loss系数)", value=0)
op_step = gr.Number(label="Optimize Steps(优化步数)", value=1)
with gr.Row(variant="compact"):
                start_op_step = gr.Number(label="Start Optimize Step(开始优化的inference步数)", value=0)
end_op_step = gr.Number(label="End Optimize Step(结束优化的inference步数)", value=10)
with gr.Row(variant="compact"):
loss_alpha = gr.Number(label="Alpha(Ocr Loss系数)", value=1)
loss_beta = gr.Number(label="Beta(MSE Loss 系数)", value=0)
add_theta = gr.Number(label="Theta(Add glyph 系数)", value=0.35)
prompt = gr.Textbox(label="Prompt(提示词)")
with gr.Tabs() as tab_modes:
with gr.Tab(
"🖼Text Generation(文字生成)", elem_id="MD-tab-t2i"
) as mode_gen:
pos_radio = gr.Radio(
["Manual-draw(手绘)", "Manual-rect(拖框)", "Auto-rand(随机)"],
value="Manual-draw(手绘)",
label="Pos-Method(位置方式)",
info="choose a method to specify text positions(选择方法用于指定文字位置).",
)
with gr.Row():
sort_radio = gr.Radio(
["↕", "↔"],
value="↕",
label="Sort Position(位置排序)",
info="position sorting priority(位置排序时的优先级)",
)
revise_pos = gr.Checkbox(
label="Revise Position(修正位置)", value=False
)
upload_button = gr.UploadButton(
"Click to upload Mask", file_types=["image"]
)
# gr.Markdown('<span style="color:silver;font-size:12px">try to revise according to text\'s bounding rectangle(尝试通过渲染后的文字行的外接矩形框修正位置)</span>')
with gr.Row(variant="compact"):
rect_cb_list: list[Component] = []
rect_xywh_list: list[Component] = []
for i in range(BBOX_MAX_NUM):
e = gr.Checkbox(
label=f"{i}", value=False, visible=False, min_width="10"
)
x = gr.Slider(
label="x",
value=0.4,
minimum=0.0,
maximum=1.0,
step=0.0001,
elem_id=f"MD-t2i-{i}-x",
visible=False,
)
y = gr.Slider(
label="y",
value=0.4,
minimum=0.0,
maximum=1.0,
step=0.0001,
elem_id=f"MD-t2i-{i}-y",
visible=False,
)
w = gr.Slider(
label="w",
value=0.2,
minimum=0.0,
maximum=1.0,
step=0.0001,
elem_id=f"MD-t2i-{i}-w",
visible=False,
)
h = gr.Slider(
label="h",
value=0.2,
minimum=0.0,
maximum=1.0,
step=0.0001,
elem_id=f"MD-t2i-{i}-h",
visible=False,
)
x.change(
fn=None,
inputs=x,
outputs=x,
_js=f'v => onBoxChange({is_t2i}, {i}, "x", v)',
show_progress=False,
queue=False,
)
y.change(
fn=None,
inputs=y,
outputs=y,
_js=f'v => onBoxChange({is_t2i}, {i}, "y", v)',
show_progress=False,
queue=False,
)
w.change(
fn=None,
inputs=w,
outputs=w,
_js=f'v => onBoxChange({is_t2i}, {i}, "w", v)',
show_progress=False,
queue=False,
)
h.change(
fn=None,
inputs=h,
outputs=h,
_js=f'v => onBoxChange({is_t2i}, {i}, "h", v)',
show_progress=False,
queue=False,
)
e.change(
fn=None,
inputs=e,
outputs=e,
_js=f"e => onBoxEnableClick({is_t2i}, {i}, e)",
queue=False,
)
rect_cb_list.extend([e])
rect_xywh_list.extend([x, y, w, h])
rect_img = gr.Image(
value=create_canvas(),
label="Rext Position(方框位置)",
elem_id="MD-bbox-rect-t2i",
show_label=False,
visible=False,
)
draw_img = gr.Image(
value=create_canvas(),
label="Draw Position(绘制位置)",
visible=True,
tool="sketch",
show_label=False,
brush_radius=100,
)
def re_draw():
return [
gr.Image(value=create_canvas(), tool="sketch"),
gr.Slider(value=512),
gr.Slider(value=512),
]
draw_img.clear(re_draw, None, [draw_img, image_width, image_height])
image_width.release(
resize_w,
[image_width, rect_img, draw_img],
[rect_img, draw_img],
)
image_height.release(
resize_h,
[image_height, rect_img, draw_img],
[rect_img, draw_img],
)
def change_options(selected_option):
return [
gr.Checkbox(visible=selected_option == "Manual-rect(拖框)")
] * BBOX_MAX_NUM + [
gr.Image(visible=selected_option == "Manual-rect(拖框)"),
gr.Image(visible=selected_option == "Manual-draw(手绘)"),
gr.Radio(visible=selected_option != "Auto-rand(随机)"),
gr.Checkbox(value=selected_option == "Auto-rand(随机)"),
]
pos_radio.change(
change_options,
pos_radio,
rect_cb_list + [rect_img, draw_img, sort_radio, revise_pos],
show_progress=False,
queue=False,
)
# with gr.Row():
# gr.Markdown("")
# run_gen = gr.Button(
# value="Run(运行)!", scale=0.3, elem_classes="run"
# )
# gr.Markdown("")
def exp_gen_click():
return [
gr.Slider(value=512),
gr.Slider(value=512),
] # all examples are 512x512, refresh draw_img
with gr.Tab("English Examples"):
exp_gen_en = gr.Examples(
[
[
'A raccoon stands in front of the blackboard with the words "Deep Learning" written on it',
"example_images/gen17.png",
"Manual-draw(手绘)",
"↕",
False,
4,
33789703,
],
[
'A crayon drawing by child, a snowman with a Santa hat, pine trees, outdoors in heavy snowfall, titled "Snowman"',
"example_images/gen18.png",
"Manual-draw(手绘)",
"↕",
False,
4,
35621187,
],
[
'A meticulously designed logo, a minimalist brain, stick drawing style, simplistic style, refined with minimal strokes, black and white color, white background, futuristic sense, exceptional design, logo name is "NextAI"',
"example_images/gen19.png",
"Manual-draw(手绘)",
"↕",
False,
4,
2563689,
],
[
'A photograph of the colorful graffiti art on the wall with the words "Hi~" "Get Ready" "to" "Party"',
"example_images/gen21.png",
"Manual-draw(手绘)",
"↕",
False,
4,
88952132,
],
[
'photo of caramel macchiato coffee on the table, top-down perspective, with "Any" "Text" written on it using cream',
"example_images/gen9.png",
"Manual-draw(手绘)",
"↕",
False,
4,
66273235,
],
[
'A fine sweater with knitted text: "Have" "A" "Good Day"',
"example_images/gen20.png",
"Manual-draw(手绘)",
"↕",
False,
4,
35107824,
],
[
'Sign on the clean building that reads "科学" and "과학" and "ステップ" and "SCIENCE"',
"example_images/gen6.png",
"Manual-draw(手绘)",
"↕",
True,
4,
13246309,
],
[
'A delicate square cake, cream and fruit, with "CHEERS" "to the" and "GRADUATE" written in chocolate',
"example_images/gen8.png",
"Manual-draw(手绘)",