-
Notifications
You must be signed in to change notification settings - Fork 776
/
Copy pathimport_emojis.py
executable file
·147 lines (124 loc) · 6.66 KB
/
import_emojis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python3
import json
import os
import re
import requests
from bs4 import BeautifulSoup
from collections import OrderedDict
# Words that stay lowercase inside an emoji name (unless they are the first word).
capitalization_exclude = {
    'a', 'and', 'at', 'but', 'for', 'in', 'of',
    'off', 'on', 'or', 'out', 'over', 'the', 'with',
}

# Containers that are filled while scraping. They are created first so that the
# short aliases and the entries of the final structure are the very same objects.
emoji_picker_datasource_categories = []
emoji_picker_datasource_emojis = {}

# Skeleton of the final json file, as a python dictionary.
# Key order is kept as-is because it is the order the keys are serialized in.
emoji_picker_datasource = {
    "compressed": True,
    "categories": emoji_picker_datasource_categories,
    "emojis": emoji_picker_datasource_emojis,
    "aliases": {},
}
# Get the official emoji list from unicode.org.
# NOTE: this URL is not versioned, so the Emoji List revision that is parsed is
# whatever unicode.org serves at the time the script runs.
print("Fetching emoji list from Unicode.org...")
req = requests.get("https://unicode.org/emoji/charts/emoji-list.html", timeout=60)
# Fail right away on an HTTP error instead of parsing an error page, which would
# only surface later as a confusing failure while navigating to the table.
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')

# Get the emoji variation sequences (this file is pinned to Unicode 15.0.0).
variation_sequence_response = requests.get(
    "https://www.unicode.org/Public/15.0.0/ucd/emoji/emoji-variation-sequences.txt",
    timeout=60,
)
variation_sequence_response.raise_for_status()
variation_sequence_data = variation_sequence_response.text

# Map the code of every emoji that has an "emoji style" line to the same code
# followed by "-FE0F".
# Assumes each such line starts with the base code point in hex followed by a
# space -- TODO confirm against the file format if the pinned version changes.
variation_sequence_overrides = {}
for line in variation_sequence_data.splitlines():
    if "emoji style" in line:
        emoji_hex = line.split(" ", 1)[0]
        variation_sequence_overrides[emoji_hex] = emoji_hex + "-FE0F"
# Patterns applied to every emoji row, compiled once outside the loop.
# Raw strings are required: '\W' in a plain string literal is an invalid escape sequence.
_emoji_id_strip_pattern = re.compile(r'[^A-Za-z0-9 ]+', flags=re.UNICODE)
_emoji_name_split_pattern = re.compile(r'(\W)')

# Navigate to table
table = soup.body.table

# Go over all rows. Category header rows open a new category, emoji rows are
# added to the most recently opened category.
print("Extracting emojis...")
for row in table.find_all('tr'):
    # Category header rows contain a 'bighead' header cell; the anchor inside
    # the row carries the category id (its name attribute) and the display name.
    if row.find('th', class_='bighead') is not None:
        relevant_element = row.find('a')
        category_id = relevant_element['name']
        category_name = relevant_element.text
        emoji_picker_datasource_categories.append({
            "id": category_id,
            "name": category_name,
            "emojis": []
        })

    # Emoji rows are recognised by their 'code' cell.
    if row.find('td', class_='code'):
        # Columns are: number, code, sample, CLDR name, keywords.
        # Only code, name and keywords are needed.
        cols = row.find_all('td')
        code_element = cols[1]
        cldr_element = cols[3]
        keywords_element = cols[4]

        # Derive the id from the name: lowercase, keep only ASCII letters,
        # digits and spaces, trim, then replace spaces with dashes.
        emoji_name = cldr_element.text
        emoji_id = _emoji_id_strip_pattern.sub('', emoji_name.lower())
        emoji_id = emoji_id.strip().replace(' ', '-')

        # Capitalize name according to the same rules as the previous emoji_picker_datasource.json
        # - Words are separated by any non-word character (\W), e.g. space, comma, parentheses, dots, etc.
        #   The separators are kept by the split (capture group) so the name can be joined back unchanged.
        # - Words are capitalized if they are either at the beginning of the name
        #   OR not in capitalization_exclude (extracted from the previous datasource, too)
        emoji_name_cap = "".join(
            w.capitalize() if i == 0 or w not in capitalization_exclude else w
            for i, w in enumerate(_emoji_name_split_pattern.split(emoji_name))
        )

        # Extract the code points: drop the first two characters of every
        # space-separated entry (the "U+" prefix, presumably) and join with dashes.
        emoji_code = "-".join(e[2:] for e in code_element.text.split(" "))

        # Extract keywords
        emoji_keywords = keywords_element.text.split(" | ")

        # Add the emoji-id to the last entry in "categories"
        emoji_picker_datasource_categories[-1]["emojis"].append(emoji_id)

        # Add the emoji itself to the "emojis" dict
        # ("a" = display name, "b" = code, "j" = keywords)
        emoji_picker_datasource_emojis[emoji_id] = {
            "a": emoji_name_cap,
            "b": emoji_code,
            "j": emoji_keywords
        }
# The keywords of unicode.org are usually quite sparse.
# There is no official specification of keywords beyond that, but muan/emojilib is an established,
# well maintained repository with additional keywords. We extend our list with the keywords from there.
# It may lack entries for the most recently added emojis; those are reported below and keep
# their unicode.org keywords only.
print("Fetching additional keywords from Emojilib...")
req = requests.get(
    "https://raw.githubusercontent.com/muan/emojilib/main/dist/emoji-en-US.json",
    timeout=60,
)
# Fail right away on an HTTP error instead of trying to decode an error page as JSON.
req.raise_for_status()
emojilib_data = json.loads(req.content)

# We just go over all the official emojis from unicode, and add the keywords there
print("Adding keywords to emojis...")
for emoji, emoji_entry in emoji_picker_datasource_emojis.items():
    emoji_name = emoji_entry["a"]
    emoji_code = emoji_entry["b"]

    # Convert the dash-separated hex code points back to the actual unicode emoji
    emoji_unicode = "".join(chr(int(code_point, 16)) for code_point in emoji_code.split("-"))

    # Search for the emoji in emojilib, first as-is, then with a trailing U+FE0F
    if emoji_unicode in emojilib_data:
        emoji_additional_keywords = emojilib_data[emoji_unicode]
    elif emoji_unicode + chr(0xfe0f) in emojilib_data:
        emoji_additional_keywords = emojilib_data[emoji_unicode + chr(0xfe0f)]
    else:
        emoji_additional_keywords = None

    if emoji_additional_keywords is None:
        print("* No additional keywords for", emoji_unicode, emoji_entry)
    else:
        # Avoid duplicates and keep order. Put official unicode.org keywords first
        # and extend them with the emojilib ones.
        new_keywords = OrderedDict.fromkeys(emoji_entry["j"] + emoji_additional_keywords)
        # Remove the ones derived from the unicode name
        for keyword in (emoji.replace("-", "_"), emoji.replace("-", " "), emoji_name):
            new_keywords.pop(keyword, None)
        # Write new keywords back
        emoji_entry["j"] = list(new_keywords.keys())

    # Apply the variation selector override. This does not depend on the keywords,
    # so it is done for every emoji, including those without an emojilib entry.
    if emoji_code in variation_sequence_overrides:
        emoji_entry["b"] = variation_sequence_overrides[emoji_code]
# Drop the 'component' category (components are not suitable for single-emoji reactions).
# Only the category entry is removed; the corresponding entries in "emojis" are left in place.
emoji_picker_datasource['categories'] = [x for x in emoji_picker_datasource['categories'] if x['id'] != 'component']

# Write result to file (overwrite previous), without escaping unicode characters.
# Because of ensure_ascii=False the output contains raw non-ASCII characters, so the
# files are opened explicitly as UTF-8 instead of relying on the platform default encoding.
print("Writing emoji_picker_datasource.json...")
scripts_dir = os.path.dirname(os.path.abspath(__file__))
compact_path = os.path.join(scripts_dir, "../vector/src/main/res/raw/emoji_picker_datasource.json")
with open(compact_path, "w", encoding="utf-8") as outfile:
    # Compact separators keep the file as small as possible.
    json.dump(emoji_picker_datasource, outfile, ensure_ascii=False, separators=(',', ':'))

# Also export a formatted (indented) version
print("Writing emoji_picker_datasource_formatted.json...")
formatted_path = os.path.join(scripts_dir, "../tools/emojis/emoji_picker_datasource_formatted.json")
with open(formatted_path, "w", encoding="utf-8") as outfile:
    json.dump(emoji_picker_datasource, outfile, ensure_ascii=False, indent=4)

print("Done.")