plugins/emoji/update_emoji.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213

"""
Update Emoji.py
Refeshes OMZ emoji database based on the latest Unicode spec
"""
import re
import json

spec = open("emoji-data.txt", "r")

# Regexes
# regex_emoji will return, respectively:
# the code points, its type (status), the actual emoji, and its official name
regex_emoji = r"^([\w ].*?\S)\s*;\s*([\w-]+)\s*#\s*(.*?)\s(\S.*).*$"
# regex_group returns the group of subgroup that a line opens
regex_group = r"^#\s*(group|subgroup):\s*(.*)$"

headers = """
# emoji-char-definitions.zsh - Emoji definitions for oh-my-zsh emoji plugin
#
# This file is auto-generated by update_emoji.py. Do not edit it manually.
#
# This contains the definition for:
#   $emoji         - which maps character names to Unicode characters
#   $emoji_flags   - maps country names to Unicode flag characters using region
#                    indicators
#   $emoji_mod     - maps modifier components to Unicode characters
#   $emoji_groups  - a single associative array to avoid cluttering up the
#                    global namespace, and to allow adding additional group
#                    definitions at run time. The keys are the group names, and
#                    the values are whitespace-separated lists of emoji
#                    character names.

# Main emoji
typeset -gAH emoji
# National flags
typeset -gAH emoji_flags
# Combining modifiers
typeset -gAH emoji_mod
# Emoji groups
typeset -gAH emoji_groups
"""

#######
# Adding country codes
#######
# This is the only part of this script that relies on an external library
# (country_converter), and is hence commented out by default.
# You can uncomment it to have country codes added as aliases for flag
# emojis. (By default, when you install this extension, country codes are
# included as aliases, but not if you re-run this script without uncommenting.)
# Warning: country_converter is very verbose, and will print warnings all over
# your terminal.

# import country_converter as coco # pylint: disable=wrong-import-position
# cc = coco.CountryConverter()

# def country_iso(_all_names, _omz_name):
#     """ Using the external library country_converter,
#         this function can detect the ISO2 and ISO3 codes
#         of the country. It takes as argument the array
#         with all the names of the emoji, and returns that array."""
#     omz_no_underscore = re.sub(r'_', r' ', _omz_name)
#     iso2 = cc.convert(names=[omz_no_underscore], to='ISO2')
#     if iso2 != 'not found':
#         _all_names.append(iso2)
#         iso3 = cc.convert(names=[omz_no_underscore], to='ISO3')
#         _all_names.append(iso3)
#     return _all_names


#######
# Helper functions
#######

def code_to_omz(_code_points):
    """ Returns a ZSH-compatible Unicode string from the code point(s) """
    return r'\U' + r'\U'.join(_code_points.split(' '))

def name_to_omz(_name, _group, _subgroup, _status):
    """ Returns a reasonable snake_case name for the emoji. """
    def snake_case(_string):
        """ Does the regex work of snake_case """
        remove_dots = re.sub(r'\.\(\)', r'', _string)
        replace_ands = re.sub(r'\&', r'and', remove_dots)
        remove_whitespace = re.sub(r'[^\#\*\w]', r'_', replace_ands)
        return re.sub(r'__', r'_', remove_whitespace)

    shortname = ""
    split_at_colon = lambda s: s.split(": ")
    # Special treatment by group and subgroup
    # If the emoji is a flag, we strip "flag" from its name
    if _group == "Flags" and len(split_at_colon(_name)) > 1:
        shortname = snake_case(split_at_colon(_name)[1])
    else:
        shortname = snake_case(_name)
    # Special treatment by status
    # Enables us to have every emoji combination,
    # even the one that are not officially sanctionned
    # and are implemented by, say, only one vendor
    if _status == "unqualified":
        shortname += "_unqualified"
    elif _status == "minimally-qualified":
        shortname += "_minimally"
    return shortname

def increment_name(_shortname):
    """ Increment the short name by 1. If you get, say,
    'woman_detective_unqualified', it returns
    'woman_detective_unqualified_1', and then
    'woman_detective_unqualified_2', etc. """
    last_char = _shortname[-1]
    if last_char.isdigit():
        num = int(last_char)
        return _shortname[:-1] + str(num + 1)
    return _shortname + "_1"

########
# Going through every line
########

group, subgroup, short_name_buffer = "", "", ""
emoji_database = []
for line in spec:
    # First, test if this line opens a group or subgroup
    group_match = re.findall(regex_group, line)
    if group_match != []:
        gr_or_sub, name = group_match[0]
        if gr_or_sub == "group":
            group = name
        elif gr_or_sub == "subgroup":
            subgroup = name
        continue # Moving on...
    # Second, test if this line references one emoji
    emoji_match = re.findall(regex_emoji, line)
    if emoji_match != []:
        code_points, status, emoji, name = emoji_match[0]
        omz_codes = code_to_omz(code_points)
        omz_name = name_to_omz(name, group, subgroup, status)
        # If this emoji has the same shortname as the preceding one
        if omz_name in short_name_buffer:
            omz_name = increment_name(short_name_buffer)
        short_name_buffer = omz_name
        emoji_database.append(
            [omz_codes, status, emoji, omz_name, group, subgroup])
spec.close()

########
# Write to emoji-char-definitions.zsh
########

# Aliases for emojis are retrieved through the DB of Gemoji
# Retrieved on Aug 9 2019 from the following URL:
# https://raw.githubusercontent.com/github/gemoji/master/db/emoji.json

gemoji_db = open("gemoji_db.json")
j = json.load(gemoji_db)
aliases_map = {entry['emoji']: entry['aliases'] for entry in j}
all_omz_names = [emoji_data[3] for emoji_data in emoji_database]

# Let's begin writing to this file
output = open("emoji-char-definitions.zsh", "w")
output.write(headers)

emoji_groups = {"fruits": "\n", "vehicles": "\n", "hands": "\n",
                "people": "\n", "animals": "\n", "faces": "\n",
                "flags": "\n"}

# First, write every emoji down
for _omz_codes, _status, _emoji, _omz_name, _group, _subgroup in emoji_database:

    # One emoji can be mapped to multiple names (aliases or country codes)
    names_for_this_emoji = [_omz_name]

    # Variable that indicates in which map the emoji will be located
    emoji_map = "emoji"
    if _status == "component":
        emoji_map = "emoji_mod"
    if _group == "Flags":
        emoji_map = "emoji_flags"
        # Adding country codes (Optional, see above)
        # names_for_this_emoji = country_iso(names_for_this_emoji, _omz_name)

    # Check if there is an alias available in the Gemoji DB
    if _emoji in aliases_map.keys():
        for alias in aliases_map[_emoji]:
            if alias not in all_omz_names:
                names_for_this_emoji.append(alias)

    # And now we write to the definitions file
    for one_name in names_for_this_emoji:
        output.write(f"{emoji_map}[{one_name}]=$'{_omz_codes}'\n")

    # Storing the emoji in defined subgroups for the next step
    if _status == "fully-qualified":
        if _subgroup == "food-fruit":
            emoji_groups["fruits"] += f"  {_omz_name}\n"
        elif "transport-" in _subgroup:
            emoji_groups["vehicles"] += f"  {_omz_name}\n"
        elif "hand-" in _subgroup:
            emoji_groups["hands"] += f"  {_omz_name}\n"
        elif "person-" in _subgroup or _subgroup == "family":
            emoji_groups["people"] += f"  {_omz_name}\n"
        elif "animal-" in _subgroup:
            emoji_groups["animals"] += f"  {_omz_name}\n"
        elif "face-" in _subgroup:
            emoji_groups["faces"] += f"  {_omz_name}\n"
        elif _group == "Flags":
            emoji_groups["flags"] += f"  {_omz_name}\n"

# Second, write the subgroups to the end of the file
for name, string in emoji_groups.items():
    output.write(f'\nemoji_groups[{name}]="{string}"\n')
output.close()