forked from Ezhil-Language-Foundation/open-tamil
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathregexp.py
85 lines (70 loc) · 2.44 KB
/
regexp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# -*- coding: utf-8 -*-
# (C) 2015 Muthiah Annamalai
# Licensed under GPL Version 3
import re
from . import utf8
# predicate
def is_containing_seq(start, end, seq):
    """Return True when both range endpoints are members of ``seq``.

    Args:
        start: first endpoint of the range.
        end: second endpoint of the range.
        seq: container searched with the ``in`` operator.
    """
    return all(endpoint in seq for endpoint in (start, end))
# expand
def expand_sequence(start, end, seq):
    """Return the letters of ``seq`` between two endpoints, comma-joined.

    The endpoints may be given in either order; the slice always runs from
    the earlier position to the later one and includes both endpoints.

    Args:
        start: one endpoint of the range; must be an element of ``seq``.
        end: the other endpoint of the range; must be an element of ``seq``.
        seq: ordered sequence of letters supporting ``index`` and slicing.

    Returns:
        The selected letters joined with ``","``.

    Raises:
        ValueError: if ``start`` or ``end`` is not present in ``seq``.
    """
    # index() raises ValueError for a missing element and never returns a
    # negative position, so no separate non-negativity check is needed.
    low, high = sorted((seq.index(start), seq.index(end)))
    return u",".join(seq[low : high + 1])
def expand_tamil(start, end):
    """Expand a Tamil letter range into a comma-separated letter list.

    For example, அ-ஔ is converted to அ,ஆ,இ,ஈ,உ,ஊ,எ,ஏ,ஐ,ஒ,ஓ,ஔ.
    The letter tables are tried in a fixed order and the first table that
    contains both endpoints is the one used for the expansion.

    Raises:
        Exception: if no table contains both ``start`` and ``end``.
    """
    candidate_tables = (
        utf8.uyir_letters,
        utf8.grantha_mei_letters,
        utf8.grantha_agaram_letters,
        # all Tamil letters, tried last
        utf8.grantha_uyirmei_letters,
    )
    for table in candidate_tables:
        if not is_containing_seq(start, end, table):
            continue
        return expand_sequence(start, end, table)
    raise Exception("Cannot understand sequence [%s-%s]" % (start, end))
def make_pattern(patt, flags=0):
    """Compile a regular expression after expanding Tamil letter ranges.

    The pattern is split into letters with ``utf8.get_letters``. A Tamil
    letter (per ``utf8.istamil``) that is immediately followed by ``-`` is
    remembered as a range start; a Tamil letter that immediately follows
    ``-`` closes the range, and the output of ``expand_tamil`` is emitted in
    its place. Every other letter is copied to the output unchanged.

    Args:
        patt: pattern text, possibly containing Tamil ranges.
        flags: flags forwarded to ``re.compile``.

    Returns:
        A tuple ``(compiled_pattern, expanded_pattern_text)``.

    Raises:
        Exception: propagated from ``expand_tamil`` when a range cannot be
            expanded.
        re.error: if the expanded text is not a valid regular expression.
    """
    letters = utf8.get_letters(patt)
    total = len(letters)
    pieces = []
    range_start = None
    prev = None
    idx = 0
    while idx < total:
        letter = letters[idx]
        followed_by_dash = (idx + 1) < total and letters[idx + 1] == u"-"
        if utf8.istamil(letter) and (prev == "-" or followed_by_dash):
            if followed_by_dash:
                # Range start: remember it and skip over the dash.
                # NOTE(review): the start letter and dash are not emitted, so
                # they are dropped if no Tamil letter follows the dash.
                range_start = letter
                idx += 2
                prev = "-"
            else:
                # Range end: emit the expanded letters in place of the range.
                pieces.append(expand_tamil(range_start, letter))
                # Record the consumed letter before advancing; indexing after
                # the increment raised IndexError when the range closed the
                # pattern.
                prev = letter
                idx += 1
            continue
        pieces.append(letter)
        prev = letter
        idx += 1
    opattern = u"".join(pieces)
    compile_regexp = re.compile(opattern, flags)
    return (compile_regexp, opattern)
def search(patt, inputstr):
    """Search ``inputstr`` for a pattern that may contain Tamil ranges.

    Args:
        patt: pattern text handed to ``make_pattern``.
        inputstr: text to scan.

    Returns:
        A tuple ``(result, compiled_pattern)`` where ``result`` is whatever
        ``re.search`` returns (a match object or ``None``).
    """
    # make_pattern returns (compiled, text); re.search needs only the
    # compiled pattern -- passing the whole tuple raises TypeError.
    custom_patt = make_pattern(patt)[0]
    return (re.search(custom_patt, inputstr), custom_patt)
def match(patt, inputstr):
    """Match a pattern that may contain Tamil ranges at the start of ``inputstr``.

    Returns:
        A tuple ``(result, compiled_pattern)`` where ``result`` is a match
        object or ``None``.
    """
    compiled, _expanded_text = make_pattern(patt)
    outcome = compiled.match(inputstr)
    return (outcome, compiled)