# -*- coding: utf-8 -*-
'''
A lexical grammar for CSS.
'''

import re

import ply.lex as lex
from ply.lex import TOKEN

# re helpers

def r_nongroup(rx):
    '''wrap `rx` in a non-capturing group: (?:rx)'''
    opener, closer = r'(?:', r')'
    return opener + rx + closer

def r_or(*rxs):
    '''alternation of the given patterns

    Every pattern is grouped on its own, the groups are joined with "|",
    and the whole alternation is grouped once more so it can be safely
    concatenated with other patterns.
    '''
    alternatives = map(r_nongroup, rxs)
    return r_nongroup(r'|'.join(alternatives))

def r_star(rx):
    '''zero or more repetitions of `rx` (grouped first, so the quantifier covers all of it)'''
    return r'%s*' % r_nongroup(rx)

def r_plus(rx):
    '''one or more repetitions of `rx` (grouped first, so the quantifier covers all of it)'''
    return r'%s+' % r_nongroup(rx)

def r_opt(rx):
    '''optional `rx` (grouped first, so the quantifier covers all of it)'''
    return r'%s?' % r_nongroup(rx)


# optional single whitespace item (a CR LF pair or one whitespace character);
# appended after hex escapes in `unicode` and `letter()`
softsp    = r_opt(r_or(r'\r\n', r'[ \t\r\n\f]'))

# one or more whitespace characters (space, tab, CR, LF, form feed)
ws        = r'[ \t\r\n\f]+'
# optional run of whitespace
wsopt     = r_opt(ws)
# A single line terminator.  The alternation is wrapped in a non-capturing
# group so that concatenating it with other text (e.g. r'\\' + nl for a
# backslash line continuation) binds to the whole alternation instead of only
# its first branch.  It still works unchanged with re.findall() for counting
# line breaks, because the group does not capture.
nl        = r'(?:\n|\r\n|\r|\f)'

# --- character-level macros -------------------------------------------------
# one hexadecimal digit
h         = r'[0-9a-fA-F]'
# any character outside \0-\177 (octal), i.e. outside the 7-bit range
nonascii  = r'[^\0-\177]'
# hex escape: backslash, 1-6 hex digits, then the optional `softsp` whitespace
# NOTE(review): this module-level name shadows the Python 2 builtin `unicode`.
unicode   = r'\\' + h + r'{1,6}' + softsp
# any escape: a hex escape, or a backslash followed by a character that is
# neither a line terminator nor a hex digit
escape    = r_or(unicode, r'\\[^\r\n\f0-9a-fA-F]')
# first character of an identifier / any later character of a name
nmstart   = r_or(r'[_a-zA-Z]', nonascii, escape)
nmchar    = r_or(r'[_a-zA-Z0-9-]', nonascii, escape)
# double- / single-quoted string.  Each body item is one of: an ordinary
# character, the concatenation r'\\' + nl (intended as a backslash followed by
# a line terminator), or an escape.
string1   = r'"%s"' % r_star(r_or(r'[^\n\r\f\\"]', r'\\' + nl, escape))
string2   = r"'%s'" % r_star(r_or(r"[^\n\r\f\\']", r'\\' + nl, escape))
# same bodies as string1/string2 but without the closing quote, i.e. a string
# that is never terminated
invalid1  = r'"%s' % r_star(r_or(r'[^\n\r\f\\"]', r'\\' + nl, escape))
invalid2  = r"'%s" % r_star(r_or(r"[^\n\r\f\\']", r'\\' + nl, escape))

# /* ... */ comment.  Built from the helpers above; it matches the same text
# as the flat pattern
#     \/\*[^*]*\*+(?:[^/][^*]*\*+)*\/
# with each repeated sub-expression wrapped in its own non-capturing group.
# (A literal copy of that flat pattern used to be assigned here first and was
# immediately overwritten; only this assignment ever took effect.)
comment   = r'\/\*' + r_star(r'[^*]') + r_plus(r'\*') + r_star(r'[^/]' + r_star(r'[^*]') + r_plus(r'\*')) + r'\/'

# --- composite macros -------------------------------------------------------
# identifier: optional leading '-', one nmstart, then any number of nmchar
ident     = r_opt(r'-') + nmstart + r_star(nmchar)
# name: one or more nmchar (used after '#' in t_HASH)
name      = r_plus(nmchar)
# number: digits with a fractional part (integer part optional), or plain digits
num       = r_or(r_star(r'[0-9]') + r'\.' + r_plus(r'[0-9]'), r_plus(r'[0-9]'))
# either quoting style of complete / unterminated string
string    = r_or(string1, string2)
invalid   = r_or(invalid1, invalid2)
# unquoted url(...) body: printable characters from the listed set, non-ASCII
# characters, or escapes; may be empty
url       = r_star(r_or(r'[!#$%&*-~]', nonascii, escape))

def letter(c):
    '''regex for one keyword letter, written literally or as a hex escape

    The result matches either the lower-case form of `c`, or a backslash,
    up to four zeros, the hex code of the upper- or lower-case form of `c`,
    and the optional trailing whitespace `softsp`.
    '''
    upper_code = hex(ord(c.upper()))[2:]
    lower_code = hex(ord(c.lower()))[2:]
    escaped = r'\\0{0,4}' + r_or(upper_code, lower_code) + softsp
    return r_or(c.lower(), escaped)

def normalize(x):
    '''normalizes escaped characters to their literal value.

    Every two-digit hex escape (optionally padded with up to four leading
    zeros, e.g. "\\4d", "\\00006D") is replaced by the character it denotes,
    and the result is lower-cased.  One optional whitespace item directly
    after the escape (a CR LF pair or a single whitespace character) is
    consumed with it, mirroring what `letter()` accepts, so that
    "@\\4d edia" becomes "@media".

    x       -- text that may contain hex escapes
    returns -- the lower-cased text with those escapes decoded
    '''
    # The digits are parsed base 16 below, so the class must accept a-f/A-F
    # as well as 0-9; otherwise escapes such as \4d would never be decoded.
    escaped = r'\\0{0,4}([0-9a-fA-F]{2})(?:\r\n|[ \t\r\n\f])?'

    def decode(match):
        return chr(int(match.group(1), 16))

    return re.sub(escaped, decode, x).lower()

# One pattern per letter that occurs in the keywords and units recognised
# below; each is produced by letter() for the letter of the same name.
(A, C, D, E, G, H, I, K, L, M,
 N, O, P, R, S, T, U, X, Z) = map(letter, 'ACDEGHIKLMNOPRSTUXZ')

# Names of all token types defined in this module; each entry has a matching
# t_<NAME> rule below.
tokens = (
    # whitespace, comments and SGML comment delimiters
    'SPACE',
    'COMMENT',
    'CDO',
    'CDC',
    # operators and punctuation
    'VARPREFIX',
    'INCLUDES',
    'DASHMATCH',
    'PLUS',
    'GREATER',
    'COMMA',
    'COLON',
    'SEMI',
    'ASTERIX',
    'CIRCUM',
    'MINUS',
    'DOT',
    'EQUAL',
    'SLASH',
    'LPAREN',
    'RPAREN',
    'LBRACKET',
    'RBRACKET',
    'LBRACE',
    'RBRACE',
    # strings, identifiers and hashes
    'STRING',
    'INVALID',
    'IDENT',
    'HASH',
    # at-keywords and !important
    'IMPORT_SYM',
    'PAGE_SYM',
    'MEDIA_SYM',
    'CHARSET_SYM',
    'FONTFACE_SYM',
    'KEYFRAMES_SYM',
    'IMPORTANT_SYM',
    # numbers with and without units
    'EMS',
    'EXS',
    'LENGTH',
    'ANGLE',
    'TIME',
    'FREQ',
    'DIMENSION',
    'PERCENTAGE',
    'NUMBER',
    # url(...) and function openers
    'URI',
    'FUNCTION'
)

# Several of the following rules are defined as functions rather than
# simple string rules so that tokenizing precedence works properly,
# e.g. lengths, angles, etc. are not parsed as generic dimensions.

# Run of whitespace.  The regex is the function's docstring, so it has to
# remain the first statement.  Form feed is part of the class so that this
# token agrees with the `ws` macro used inside other rules; without it a form
# feed between tokens would fall through to the error rule.
def t_SPACE(t):
    r'[ \t\r\n\f]+'
    # keep the lexer's line counter in step with the newlines just consumed
    t.lexer.lineno += t.value.count('\n')
    return t

# /* ... */ comment; may span lines, so the lexer's line counter is advanced
# by the number of line terminators found inside the matched text.
@TOKEN(comment)
def t_COMMENT(t):
    line_breaks = re.findall(nl, t.value)
    t.lexer.lineno += len(line_breaks)
    return t

# Punctuation and operator tokens, given as plain string rules.
# Several of these share a prefix ('-->' / '--' / '-'); per the PLY
# documentation string rules are tried in order of decreasing regex length,
# so the text of these patterns must not be shortened or lengthened casually.
t_CDO          = r'\<\!\-\-'
t_CDC          = r'\-\-\>'
t_INCLUDES     = r'\~\='
t_DASHMATCH    = r'\|\='
t_VARPREFIX    = r'\-\-'
t_PLUS         = r'\+'
t_GREATER      = r'\>'
t_COMMA        = r'\,'
t_COLON        = r'\:'
t_SEMI         = r'\;'
t_ASTERIX      = r'\*'
t_CIRCUM       = r'\^'
t_MINUS        = r'\-'
t_DOT          = r'\.'
t_EQUAL        = r'\='
t_SLASH        = r'/'
t_LPAREN       = r'\('
t_RPAREN       = r'\)'
t_LBRACKET     = r'\['
t_RBRACKET     = r'\]'
t_LBRACE       = r'\{'
t_RBRACE       = r'\}'

# Complete quoted string (either quoting style); the line counter is advanced
# by the number of line terminators inside the matched text.
@TOKEN(string)
def t_STRING(t):
    line_breaks = re.findall(nl, t.value)
    t.lexer.lineno += len(line_breaks)
    return t

# Unterminated quoted string; the line counter is advanced by the number of
# line terminators inside the matched text.
@TOKEN(invalid)
def t_INVALID(t):
    line_breaks = re.findall(nl, t.value)
    t.lexer.lineno += len(line_breaks)
    return t

# identifier
t_IDENT = ident

# '#' followed by a name (e.g. an id selector or a hex colour)
t_HASH         = r'\#' + name

# at-keywords spelled letter by letter with the letter() patterns, so each
# letter may also be written as a hex escape
t_IMPORT_SYM   = r'@' + I + M + P + O + R + T
t_PAGE_SYM     = r'@' + P + A + G + E
t_MEDIA_SYM    = r'@' + M + E + D + I + A

# at-keywords matched as literal lower-case text only
# NOTE(review): unlike the three rules above these do not accept escaped
# letters — confirm that this difference is intended.
t_CHARSET_SYM = r'@charset'
t_FONTFACE_SYM = r'@font-face'
t_KEYFRAMES_SYM = r'@keyframes'
# '!' then any mix of whitespace and comments, then the word "important"
t_IMPORTANT_SYM = r'\!' + r_star(r_or(ws, comment)) + I + M + P + O + R + T + A + N + T

# Numeric tokens.  The unit-specific rules are functions and are listed
# before the generic DIMENSION rule; keep this order.
# NOTE(review): none of the unit patterns checks what follows the unit, so
# an input such as `10sec` looks like it would match TIME (`10s`) and leave
# `ec` behind — confirm whether that is acceptable.

# number + "em"
@TOKEN(num + E + M)
def t_EMS(t):
    return t

# number + "ex"
@TOKEN(num + E + X)
def t_EXS(t):
    return t

# number + one of px, cm, mm, in, pt, pc
@TOKEN(num + r_or(P + X, r_or(C, M) + M, I + N, P + r_or(T, C)))
def t_LENGTH(t):
    return t

# number + one of deg, rad, grad
@TOKEN(num + r_or(D + E + G, r_opt(G) + R + A + D))
def t_ANGLE(t):
    return t

# number + s or ms
@TOKEN(num + r_opt(M) + S)
def t_TIME(t):
    return t

# number + hz or khz
@TOKEN(num + r_opt(K) + H + Z)
def t_FREQ(t):
    return t

# number + any other identifier used as a unit
@TOKEN(num + ident)
def t_DIMENSION(t):
    return t

# number + '%'
@TOKEN(num + r'%')
def t_PERCENTAGE(t):
    return t

# bare number (string rule)
t_NUMBER = num

# url( ... ): the letters "url", '(', optional whitespace, a quoted string or
# an unquoted url body, optional whitespace, ')'.  Defined before t_FUNCTION,
# whose pattern would also match the "url(" prefix.
@TOKEN(U + R + L + r'\(' + wsopt + r_or(string, url) + wsopt + r'\)')
def t_URI(t):
    return t

# function opener: an identifier immediately followed by '('
@TOKEN(ident + r'\(')
def t_FUNCTION(t):
    return t

# recover from lexing errors by emitting an "error" token and skipping the offending character

def calc_column(input, lexpos):
    '''1-based column of position `lexpos` within its line of `input`

    Lines are delimited by "\\n" only.
    '''
    # index of the newline that ends the previous line, or -1 on the first line
    prev_newline = input.rfind('\n', 0, lexpos)
    return lexpos - prev_newline

def t_ANY_error(t):
    # Error rule: report the first unmatched character with its line and
    # column, switch the lexer back to the INITIAL state, and hand back a
    # token whose value is just that one character.
    bad_char = t.value[0]
    column = calc_column(t.lexer.lexdata, t.lexpos)
    message = 'csslex: Illegal character {0} at {1}:{2}'.format(repr(bad_char), t.lineno, column)
    print(message)
    t.lexer.begin('INITIAL')
    t.value = bad_char
    # step over the offending character so lexing can continue after it
    t.lexer.skip(1)
    return t
