Source code for linkify_it.ucre

from uc_micro.categories import Cc, Cf, P, Z
from uc_micro.properties import Any

# Regex sources taken verbatim from the uc_micro classes imported above.
# Per the comments in this module: Z = white spaces, P = punctuation,
# Cc = control, Cf = format; Any comes from uc_micro.properties.
SRC_ANY, SRC_CC, SRC_CF, SRC_P, SRC_Z = (
    Any.REGEX,
    Cc.REGEX,
    Cf.REGEX,
    P.REGEX,
    Z.REGEX,
)

# \p{\Z\P\Cc} (white spaces + punctuation + control).
# Note: the format class (SRC_CF) is NOT part of this alternation.
SRC_ZPCC = SRC_Z + "|" + SRC_P + "|" + SRC_CC

# \p{\Z\Cc} (white spaces + control)
SRC_ZCC = SRC_Z + "|" + SRC_CC

# Experimental. Characters that are completely prohibited in links,
# because they can separate a link from the rest of the text.
TEXT_SEPARATORS = "[><\uff5c]"

# All possible word characters (everything except punctuation, spaces and
# controls). Defined by excluding punctuation & spaces to save space.
# Should be something like \p{\L\N\S\M} (\w but without `_`).
SRC_PSEUDO_LETTER = "".join(
    ["(?:(?!", TEXT_SEPARATORS, "|", SRC_ZPCC, ")", SRC_ANY, ")"]
)
# The same as above but without [0-9]
# var SRC_PSEUDO_LETTER_non_d = '(?:(?![0-9]|' + SRC_ZPCC + ')' + SRC_ANY + ')'

# =============================================================================

# Dotted quad: three "<octet>." repetitions followed by a final octet.
# Each octet alternative is 25[0-5], 2[0-4][0-9] or [01]?[0-9][0-9]?.
# Both octet groups are capturing groups, exactly as written.
SRC_IP4 = (
    r"(?:(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"
    r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
)

# Optional "user:pass@" prefix. Any of "@/[]()" (and any SRC_ZCC character)
# is prohibited inside user/pass to avoid fetching the wrong domain.
SRC_AUTH = "".join(["(?:(?:(?!", SRC_ZCC, r"|[@/\[\]()]).)+@)?"])

# Optional ":<port>" suffix.
SRC_PORT = (
    r"(?::(?:"
    # 6xxxx branch: 60000-64999, 65000-65499, 65500-65529, 65530-65535
    r"6(?:[0-4]\d{3}|5(?:[0-4]\d{2}|5(?:[0-2]\d|3[0-5])))"
    # otherwise: one to four digits, optionally preceded by a digit 1-5
    r"|[1-5]?\d{1,4}"
    r"))?"
)

# Allow anything in the markdown spec, but forbid a quote (") at the first
# position, because emails enclosed in quotes are far more common.
SRC_EMAIL_NAME = (
    r"[\-:&=\+\$,\.a-zA-Z0-9_]"  # first character: quote not allowed
    r'[\-:&=\+\$,\"\.a-zA-Z0-9_]*'  # remaining characters: quote allowed
)

# "xn--" prefix followed by 1 to 59 characters from [a-z0-9-].
SRC_XN = r"xn--[a-z0-9\-]{1,59}"

# More to read about domain names:
# http://serverfault.com/questions/638260/

# Root label: letters & digits are allowed (http://test1).
# Either an SRC_XN label or 1 to 63 pseudo letters.
SRC_DOMAIN_ROOT = "(?:" + "|".join([SRC_XN, SRC_PSEUDO_LETTER + "{1,63}"]) + ")"

# A single domain label, one of:
#   - an SRC_XN ("xn--...") label,
#   - exactly one pseudo letter,
#   - 2 to 63 characters that start and end with a pseudo letter, with
#     hyphens or pseudo letters in between.
SRC_DOMAIN = (
    "(?:"
    + "|".join(
        [
            SRC_XN,
            "(?:" + SRC_PSEUDO_LETTER + ")",
            "(?:"
            + SRC_PSEUDO_LETTER
            + "(?:-|"
            + SRC_PSEUDO_LETTER
            + "){0,61}"
            + SRC_PSEUDO_LETTER
            + ")",
        ]
    )
    + ")"
)

# Host name: zero or more "<label>." prefixes followed by a final label.
# No separate IP check (SRC_IP4 + "|") is needed here, because digits are
# already allowed in normal domain names. The final label deliberately uses
# SRC_DOMAIN, not SRC_DOMAIN_ROOT.
SRC_HOST = "".join(["(?:(?:(?:(?:", SRC_DOMAIN, r")\.)*", SRC_DOMAIN, "))"])

# Fuzzy host template without IP support: one or more "<label>." prefixes
# followed by the literal "%TLDS%" placeholder (presumably substituted with a
# TLD alternation by the caller before compilation -- confirm at call site).
TPL_HOST_NO_IP_FUZZY = "(?:(?:(?:" + SRC_DOMAIN + r")\.)+(?:%TLDS%))"

# Fuzzy host template: an IPv4 address or the no-IP fuzzy host above.
TPL_HOST_FUZZY = "(?:" + SRC_IP4 + "|" + TPL_HOST_NO_IP_FUZZY + ")"


# =============================================================================

# Rough pre-test for fuzzy links by host, used for quick rejection.
# Matches any of: "localhost", "www.", ".<1-3 digits>.", or ".<%TLDS%>"
# followed by a SRC_ZPCC character, ">" or end of input.
TPL_HOST_FUZZY_TEST = "|".join(
    [
        "localhost",
        r"www\.",
        r"\.\d{1,3}\.",
        r"(?:\.(?:%TLDS%)(?:" + SRC_ZPCC + "|>|$))",
    ]
)


def _re_host_terminator(opts):
    """Build the regex source that must hold right after a host name.

    The result is two zero-width assertions placed back to back:

    * a lookahead requiring end of input, a ``TEXT_SEPARATORS`` character or
      a ``SRC_ZPCC`` character;
    * a negative lookahead rejecting ``-``, ``_``, ``:`` followed by a digit,
      ``.-``, and ``.`` unless that dot is followed by end of input or a
      ``SRC_ZPCC`` character.

    Args:
        opts (dict): options. Only the ``"---"`` key is read; when it is
            truthy, a dash is rejected only if it is not followed by ``--``.

    Return:
        str: regex source string
    """
    if opts.get("---"):
        dash = "-(?!--)"
    else:
        dash = "-"

    must_follow = "(?=$|" + TEXT_SEPARATORS + "|" + SRC_ZPCC + ")"
    must_not_follow = "(?!" + dash + "|_|:\\d|\\.-|\\.(?!$|" + SRC_ZPCC + "))"
    return must_follow + must_not_follow


def _re_src_path(opts):
    """Build the regex source for the optional path part of a link.

    The result is one optional non-capturing group that matches either
    ``/``, ``?`` or ``#`` followed by one or more of the alternatives listed
    in the body, or a lone ``/``.

    Args:
        opts (dict): options. Only the ``"---"`` key is read; when it is
            truthy, a dash that begins exactly ``---`` is not consumed.

    Return:
        str: regex source string
    """
    zcc = SRC_ZCC

    if opts.get("---"):
        # A dash run is accepted unless it is exactly `---` (followed by a
        # non-dash or end of input); in that case the path stops before it.
        dashes = "\\-(?!--(?:[^-]|$))(?:-*)"
    else:
        dashes = "\\-+"

    alternatives = [
        # Any single character that is not space/control, not a text
        # separator, and not one of the punctuation marks handled below.
        "(?!" + zcc + "|" + TEXT_SEPARATORS + "|[()[\\]{}.,\"'?!\\-;]).",
        # Paired brackets; the content may be empty.
        "\\[(?:(?!" + zcc + "|\\]).)*\\]",
        "\\((?:(?!" + zcc + "|[)]).)*\\)",
        "\\{(?:(?!" + zcc + "|[}]).)*\\}",
        # Paired quotes; the content must be non-empty.
        '\\"(?:(?!' + zcc + '|["]).)+\\"',
        "\\'(?:(?!" + zcc + "|[']).)+\\'",
        # Unpaired apostrophe, if followed by a pseudo letter or a dash.
        "\\'(?=" + SRC_PSEUDO_LETTER + "|[-])",
        # google has many dots in "google search" links (#66, #81).
        # github has ... in commit range links,
        # Restrict to
        # - english
        # - percent-encoded
        # - parts of file path
        # - params separator
        # until more examples found.
        "\\.{2,}[a-zA-Z0-9%/&]",
        # Single dot, unless followed by space/control, another dot or end.
        "\\.(?!" + zcc + "|[.]|$)",
        dashes,
        # allow `,,,` in paths
        ",(?!" + zcc + "|$)",
        # allow `;` unless followed by space/control or end
        ";(?!" + zcc + "|$)",
        # allow `!!!` in paths, but not at the end
        "\\!+(?!" + zcc + "|[!]|$)",
        # allow `?` unless followed by space/control, another `?` or end
        "\\?(?!" + zcc + "|[?]|$)",
    ]

    return "(?:[/?#](?:" + "|".join(alternatives) + ")+|\\/)?"


def build_re(opts):
    """Build the dictionary of regex source strings.

    Keys starting with ``src_`` are plain regex sources. Keys starting with
    ``tpl_`` contain the literal ``%TLDS%`` placeholder (presumably replaced
    by the caller before compiling -- confirm at the call site).

    Args:
        opts (dict): options. It is only forwarded to ``_re_host_terminator``
            and ``_re_src_path``, which both read the ``"---"`` key.

    Return:
        dict: dict of regex string
    """
    # Both helpers depend only on ``opts``; build each piece once and reuse it.
    host_terminator = _re_host_terminator(opts)
    src_path = _re_src_path(opts)

    src_host_strict = SRC_HOST + host_terminator
    tpl_host_fuzzy_strict = TPL_HOST_FUZZY + host_terminator
    src_host_port_strict = SRC_HOST + SRC_PORT + host_terminator
    tpl_host_port_fuzzy_strict = TPL_HOST_FUZZY + SRC_PORT + host_terminator
    tpl_host_port_no_ip_fuzzy_strict = (
        TPL_HOST_NO_IP_FUZZY + SRC_PORT + host_terminator
    )

    # Group 1: what may precede the address (start of input, a text
    # separator, `"`, `(` or a space/control char). Group 2: the address.
    tpl_email_fuzzy = (
        "(^|"
        + TEXT_SEPARATORS
        + '|"|\\(|'
        + SRC_ZCC
        + ")"
        + "("
        + SRC_EMAIL_NAME
        + "@"
        + tpl_host_fuzzy_strict
        + ")"
    )

    # Fuzzy link can't be prepended with .:/\- and non punctuation.
    # but can start with > (markdown blockquote)
    link_lead = "(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|" + SRC_ZPCC + "))"
    # Opens group 2; the link itself must not start with one of these symbols.
    link_open = "((?![$+<=>^`|\uff5c])"

    regex = {
        "src_Any": SRC_ANY,
        "src_Cc": SRC_CC,
        "src_Cf": SRC_CF,
        "src_Z": SRC_Z,
        "src_P": SRC_P,
        "src_ZPCc": SRC_ZPCC,
        "src_ZCc": SRC_ZCC,
        "src_pseudo_letter": SRC_PSEUDO_LETTER,
        "src_ip4": SRC_IP4,
        "src_auth": SRC_AUTH,
        "src_port": SRC_PORT,
        "src_host_terminator": host_terminator,
        "src_path": src_path,
        "src_email_name": SRC_EMAIL_NAME,
        "src_xn": SRC_XN,
        "src_domain_root": SRC_DOMAIN_ROOT,
        "src_domain": SRC_DOMAIN,
        "src_host": SRC_HOST,
        "tpl_host_fuzzy": TPL_HOST_FUZZY,
        "tpl_host_no_ip_fuzzy": TPL_HOST_NO_IP_FUZZY,
        "src_host_strict": src_host_strict,
        "tpl_host_fuzzy_strict": tpl_host_fuzzy_strict,
        "src_host_port_strict": src_host_port_strict,
        "tpl_host_port_fuzzy_strict": tpl_host_port_fuzzy_strict,
        # Fixed: this key previously pointed at the IP-allowing variant.
        "tpl_host_port_no_ip_fuzzy_strict": tpl_host_port_no_ip_fuzzy_strict,
        # Main rules
        "tpl_host_fuzzy_test": TPL_HOST_FUZZY_TEST,
        "tpl_email_fuzzy": tpl_email_fuzzy,
        "tpl_link_fuzzy": (
            link_lead + link_open + tpl_host_port_fuzzy_strict + src_path + ")"
        ),
        "tpl_link_no_ip_fuzzy": (
            link_lead
            + link_open
            + tpl_host_port_no_ip_fuzzy_strict
            + src_path
            + ")"
        ),
    }

    return regex