6x boost for quoting

aio-libs · Nov 2, 2016 · 5c59e0f · 5c59e0f
1 parent 9da189a
commit 5c59e0f
Show file tree

Hide file tree

Showing 4 changed files with 150 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -89,3 +89,7 @@ ENV/
 .ropeproject
 
 coverage
+
+
+yarl/_quoting.c
+yarl/_quoting.html
diff --git a/benchmark.py b/benchmark.py
@@ -0,0 +1,28 @@
+import timeit
+
+
+cython_setup = """\
+from yarl.quoting import _quote as quote
+from yarl.quoting import _unquote as unquote
+"""
+
+python_setup = """\
+from yarl.quoting import _py_quote as quote
+from yarl.quoting import _py_unquote as unquote
+"""
+
+
+print("Cython quote: {:.3f} sec".format(
+    timeit.timeit("quote(s)", cython_setup+"s='/path/to'")))
+
+
+print("Python quote: {:.3f} sec".format(
+    timeit.timeit("quote(s)", python_setup+"s='/path/to'")))
+
+
+print("Cython unquote: {:.3f} sec".format(
+    timeit.timeit("unquote(s)", cython_setup+"s='/path/to'")))
+
+
+print("Python unquote: {:.3f} sec".format(
+    timeit.timeit("unquote(s)", python_setup+"s='/path/to'")))
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,2 @@
+[aliases]
+test=pytest
diff --git a/yarl/_quoting.pyx b/yarl/_quoting.pyx
@@ -0,0 +1,116 @@
+# cython: language_level=3
+
+from string import ascii_letters, ascii_lowercase, digits
+
+# Character classes used by the quoting routines in this module.  The two
+# delimiter strings match the gen-delims / sub-delims sets of RFC 3986.
+cdef str ASCII_LOWERCASE = ascii_lowercase
+cdef str GEN_DELIMS = ":/?#[]@"
+cdef str SUB_DELIMS = "!$&'()*+,;="
+cdef str RESERVED = GEN_DELIMS + SUB_DELIMS
+# Characters that _quote copies to its output without escaping.
+cdef str UNRESERVED = ascii_letters + digits + '-._~'
+
+# Every well-formed upper-case escape, '%00' through '%FF'.
+cdef set PCT_ALLOWED = {'%{:02X}'.format(i) for i in range(256)}
+# Maps the upper-case escape of each unreserved character back to that
+# character, e.g. '%41' -> 'A'.
+cdef dict UNRESERVED_QUOTED = {'%{:02X}'.format(ord(ch)): ch
+                               for ch in UNRESERVED}
+
+
+cdef Py_UCS4 _hex(unsigned long v):
+    # Map a nibble to its upper-case hexadecimal digit character.
+    # Values below 10 are offset from ord('0') == 0x30; everything else is
+    # offset so that 10 lands on ord('A') == 0x41.  No range check is made.
+    cdef unsigned long offset = 0x30 if v < 10 else 0x41 - 10
+    return <Py_UCS4>(v + offset)
+
+
+def _quote(val, *, str safe='', bint plus=False):
+    """Percent-encode *val*, normalizing escapes it already contains.
+
+    Characters in UNRESERVED or in *safe* are copied unchanged.  Any other
+    character is encoded as UTF-8 and each byte is written as an
+    upper-case ``%XX`` escape.
+
+    Escapes already present in *val* are not encoded a second time: their
+    hex digits are upper-cased, and an escape that stands for an
+    unreserved character is replaced by that character.
+
+    :param val: string to quote, or None.
+    :param safe: extra characters to copy through unescaped.
+    :param plus: when true, a space is written as ``+``.
+    :returns: the quoted string; None if *val* is None; ``''`` if *val*
+        is empty.
+    :raises TypeError: if *val* is neither None nor a str.
+    :raises ValueError: if *val* contains a ``%`` that is not followed by
+        two hex digits, including an escape cut short by the end of the
+        string.
+    """
+    if val is None:
+        return None
+    if not isinstance(val, str):
+        raise TypeError("Argument should be str")
+    if not val:
+        return ''
+    cdef str _val = <str>val
+    cdef list ret = []
+    cdef list pct = []
+    cdef unsigned char b
+    cdef Py_UCS4 ch
+    cdef str tmp
+    for ch in _val:
+        if pct:
+            # Inside an escape: collect '%' plus two more characters.
+            if u'a' <= ch <= u'z':
+                # Upper-case so the lookups below only need one spelling.
+                ch = <Py_UCS4>(<unsigned long>ch - 32)
+            pct.append(ch)
+            if len(pct) == 3:
+                tmp = "".join(pct)
+                unquoted = UNRESERVED_QUOTED.get(tmp)
+                if unquoted:
+                    # Escaped unreserved character: emit it literally.
+                    ret.append(unquoted)
+                elif tmp not in PCT_ALLOWED:
+                    # Report the offending text, not the list's repr.
+                    raise ValueError("Unallowed PCT {}".format(tmp))
+                else:
+                    ret.append(tmp)
+                del pct[:]
+            continue
+        elif ch == u'%':
+            pct = [ch]
+            continue
+
+        if plus:
+            if ch == u' ':
+                ret.append(u'+')
+                continue
+        if ch in UNRESERVED:
+            ret.append(ch)
+            continue
+        if ch in safe:
+            ret.append(ch)
+            continue
+
+        for b in <bytes>ch.encode('utf8'):
+            ret.append('%')
+            ret.append(_hex(<unsigned char>b >> 4))
+            ret.append(_hex(<unsigned char>b & 0x0f))
+
+    if pct:
+        # The input ended in the middle of an escape ('%' or '%X').  Treat
+        # it like any other malformed escape instead of silently dropping
+        # those characters from the result.
+        # NOTE(review): benchmark.py imports a pure-Python twin as
+        # yarl.quoting._py_quote; confirm it handles this case the same way.
+        raise ValueError("Unallowed PCT {}".format("".join(pct)))
+
+    return ''.join(ret)
+
+
+def _unquote(val, *, unsafe='', plus=False):
+    """Decode the percent-escapes in *val*.
+
+    Consecutive ``%XX`` escapes are collected as bytes and decoded together
+    as UTF-8, so a multi-byte character spread over several escapes is
+    restored as one character.  Decoded text found in *unsafe* is passed
+    back through ``_quote`` so it stays escaped.
+
+    Escapes whose bytes are not valid UTF-8 are written back as upper-case
+    ``%XX`` text in their original position, and an escape cut short by
+    the end of the string is kept verbatim.
+
+    :param val: string to unquote, or None.
+    :param unsafe: characters that must remain escaped in the result.
+    :param plus: accepted for symmetry with ``_quote``.
+        NOTE(review): this flag is never read below, so ``+`` is always
+        left as-is; confirm whether it should decode to a space.
+    :returns: the unquoted string; None if *val* is None; ``''`` if *val*
+        is empty.
+    :raises TypeError: if *val* is neither None nor a str.
+    :raises ValueError: from ``int()`` when the two characters after a
+        ``%`` are not a hexadecimal number.
+    """
+    if val is None:
+        return None
+    if not isinstance(val, str):
+        raise TypeError("Argument should be str")
+    if not val:
+        return ''
+    pct = ''
+    pcts = bytearray()
+    ret = []
+    for ch in val:
+        if pct:
+            pct += ch
+            if len(pct) == 3:  # pragma: no branch   # peephole optimizer
+                pcts.append(int(pct[1:], base=16))
+                pct = ''
+            continue
+        if pcts:
+            try:
+                unquoted = pcts.decode('utf8')
+            except UnicodeDecodeError:
+                if ch != '%':
+                    # A literal character ends the run of escapes, so these
+                    # bytes can never become valid.  Write them back as
+                    # escapes now, before the literal, to keep the output
+                    # in input order.
+                    ret.append(''.join(['%{:02X}'.format(b) for b in pcts]))
+                    del pcts[:]
+                # Otherwise another escape follows and may complete a
+                # multi-byte sequence: keep accumulating.
+            else:
+                if unquoted in unsafe:
+                    ret.append(_quote(unquoted))
+                else:
+                    ret.append(unquoted)
+                del pcts[:]
+
+        if ch == '%':
+            pct = ch
+            continue
+
+        ret.append(ch)
+
+    if pcts:
+        try:
+            unquoted = pcts.decode('utf8')
+        except UnicodeDecodeError:
+            # Undecodable tail: keep it escaped rather than letting the
+            # decode error escape from this function.
+            ret.append(''.join(['%{:02X}'.format(b) for b in pcts]))
+        else:
+            if unquoted in unsafe:
+                ret.append(_quote(unquoted))
+            else:
+                ret.append(unquoted)
+    if pct:
+        # Input ended inside an escape ('%' or '%X'): keep that text rather
+        # than dropping it from the result.
+        ret.append(pct)
+    return ''.join(ret)