# Lexical analysis implemented in Python

# Requires PLY (Python Lex-Yacc); install it before running this script.
# -*- coding: utf-8 -*-

#--------------------------------------------------------------------------

#Author:Jmdebugger

#email: [email protected]

#date: 2013-9-17

#--------------------------------------------------------------------------

import ply.lex as lex



# Token names declared for the lexer. The non-keyword tokens come first; the
# reserved-word token names are appended once the keyword table exists.
tokens = [
    "TOKEN_IDENT",
    "TOKEN_INT",
    "TOKEN_FLOAT",
    "TOKEN_STRING",
    "TOKEN_OP",
    "TOKEN_DELIM_COMMA",          # ,
    "TOKEN_DELIM_OPEN_PAREN",     # (
    "TOKEN_DELIM_CLOSE_PAREN",    # )
    "TOKEN_DELIM_OPEN_BRACKET",   # [
    "TOKEN_DELIM_CLOSE_BRACKET",  # ]
    "TOKEN_DELIM_OPEN_BRACE",     # {
    "TOKEN_DELIM_CLOSE_BRACE",    # }
    "TOKEN_DELIM_SEMICOLON",      # ;
]

# Keyword spelling -> token name. Every token name is 'TOKEN_RSRVD_' followed
# by the upper-cased keyword, so the table is derived from the keyword list.
reserved = {
    word: 'TOKEN_RSRVD_' + word.upper()
    for word in (
        'if', 'else', 'true', 'false', 'while', 'break',
        'continue', 'goto', 'func', 'var', 'for', 'return',
    )
}

tokens.extend(reserved.values())





# Characters skipped between tokens: space, tab and carriage return.
# PLY treats t_ignore as a literal set of characters, not a regular
# expression, so this has to be a normal (non-raw) string. Written as a raw
# string, ' \t\r' would mean space, backslash, 't' and 'r', which leaves tab
# and CR unskipped and silently drops a leading 't' or 'r' from tokens.
t_ignore = ' \t\r'

def t_COMMENT(t):
    r'(/\*(.|\n)*?\*/)|(\/\/.*)'
    # Matches block comments (/* ... */) and line comments (// ...). Nothing
    # is returned, so the matched text is discarded and yields no token.
    # A block comment may span several lines, and the newlines it swallows are
    # never seen by t_newline, so count them here to keep lexer.lineno right.
    t.lexer.lineno += t.value.count('\n')



def t_newline(t):
    r'\n+'
    # The match consists solely of newline characters, so its length is the
    # number of lines to advance. Nothing is returned, so no token is emitted.
    consumed = len(t.value)
    t.lexer.lineno = t.lexer.lineno + consumed

    

def t_error(t):
    # Error hook invoked by the lexer when no rule matches at the current
    # position. t.value holds all of the remaining input, so report only the
    # offending character and its line instead of dumping the rest of the file.
    # The parenthesized single-argument print works on Python 2 and Python 3.
    print("LaunchScript error: illegal character %r at line %d"
          % (t.value[0], t.lexer.lineno))
    # Step past the bad character so scanning can continue; if the position is
    # not advanced here, PLY aborts the scan with a LexError.
    t.lexer.skip(1)

    

def t_TOKEN_IDENT(t):
    r'[a-zA-Z_][a-zA-Z_0-9]*'
    # Identifier: a letter or underscore followed by letters, digits or
    # underscores. Keywords match the same pattern, so a word found in the
    # `reserved` table is re-tagged with its keyword token type.
    if t.value in reserved:
        t.type = reserved[t.value]
    else:
        t.type = 'TOKEN_IDENT'
    return t



# Numeric literals. PLY tries function rules in the order they are defined
# and only afterwards the rules given as plain strings. The float rule is
# therefore a function placed ahead of the integer rule; as a string rule it
# would always lose to t_TOKEN_INT and "3.14" would be split into the integer
# "3" followed by the float ".14".
def t_TOKEN_FLOAT(t):
    r'[0-9]*\.[0-9]+'
    # Optional integer part, a dot, and at least one fractional digit.
    return t


def t_TOKEN_INT(t):
    r'(0x[a-fA-F0-9]+)|([0-9]+)'
    # Hexadecimal (0x...) or decimal integer literal; the token value is left
    # as the matched text.
    return t

# Token rules given as plain regex strings.

# String literal: double-quoted text in which a backslash escapes the next
# character; an unescaped backslash or a carriage return cannot appear
# between the quotes.
t_TOKEN_STRING = r'(\"([^\\\r]|(\\.))*?\")' #|(\"([^\\\n]|(\\.))*?\")' only for windows

# Single-character delimiters, one token type each.
t_TOKEN_DELIM_COMMA = r'\,'

t_TOKEN_DELIM_OPEN_PAREN = r'\('

t_TOKEN_DELIM_CLOSE_PAREN = r'\)'

t_TOKEN_DELIM_OPEN_BRACKET = r'\['

t_TOKEN_DELIM_CLOSE_BRACKET = r'\]'

t_TOKEN_DELIM_OPEN_BRACE = r'\{'

t_TOKEN_DELIM_CLOSE_BRACE = r'\}'

t_TOKEN_DELIM_SEMICOLON = r'\;'



def t_TOKEN_OP(t):

    r'(\<\<\=)|(\>\>\=)|([\+\-\*\/\%\&\|\^\=\!\>\<]\=)|(\|\|)|(\&\&)|(\+\+)|(\-\-)|[\+\-\*\/\%\^\=\&\|\>\<\!\~]'

    # Operators, longest alternatives first: the three-character
    # shift-assignments (<<= and >>=), the two-character forms ending in '=',
    # then || && ++ --, and finally the single-character operators.
    # NOTE(review): there is no alternative for plain << or >>, so those are
    # tokenized as two separate '<' / '>' operators; confirm this is intended.

    return t







    

if __name__ == "__main__":
    # Demo driver: tokenize ./test.txt and print one "value ----> type" line
    # per token.
    lexer = lex.lex()

    # Read the file as text rather than bytes so the str-based token patterns
    # can match on Python 3; the with-block closes the file even if the read
    # fails.
    with open("./test.txt", "r") as f:
        data = f.read()
    lexer.input(data)

    # lexer.token() returns None once the input is exhausted, which ends the
    # iteration. The parenthesized single-argument print works on Python 2
    # and Python 3.
    for tok in iter(lexer.token, None):
        print(tok.value + "\t---->\t" + tok.type)



    


# You may also be interested in: (python)