angr源码分析——DFG 数据流图

这篇文章主要讲述,angr中数据流图(Data Flow Gragh)的构建。

DFG恢复的是CFG中每个基本块的数据流!
DFG为CFG的每个基本块构建一个数据流图(DFG)
DFG可以通过字典self.dfgs获得,其中key的值为基本块的地址,或DFG中的值。
param CFG:用于获得所有基本块的CFG

param annocfg:一个由向后片构建的注释cfg,用于在白名单上构建DFG。

构造函数:

def __init__(self, cfg=None, annocfg=None):
        """
        Build a Data Flow Grah (DFG) for every basic block of a CFG
        The DFGs are available in the dict self.dfgs where the key
        is a basic block addr and the value a DFG.
        :param cfg: A CFG used to get all the basic blocks
        :param annocfg: An AnnotatedCFG built from a backward slice used to only build the DFG on the whitelisted statements
        """
        if cfg is None:
            self._cfg = self.project.analyses.CFGAccurate()
        else:
            self._cfg = cfg
        self._annocfg = annocfg

        self.dfgs = self._construct()

如果没有cfg就构建cfg。

然后,调用_construct()函数构建DFG。这个函数,有点长,不过也是构造数据流的主要函数。下面开始分析吧。

def _construct(self):
        """
        We want to build the type of DFG that's used in "Automated Ident. of Crypto
        Primitives in Binary Code with Data Flow Graph Isomorphisms." Unlike that
        paper, however, we're building it on Vex IR instead of assembly instructions.
        """
        cfg = self._cfg
        p = self.project
        dfgs = {}
        l.debug("Building Vex DFG...")

        for node in cfg.nodes():#遍历每个节点
            try:
                if node.simprocedure_name == None:
                    irsb = p.factory.block(node.addr).vex #根据节点获得irsb
                else:
                    l.debug("Cannot process SimProcedures, ignoring %s" % node.simprocedure_name)
                    continue
            except Exception as e:
                l.debug(e)
                continue
            tmpsnodes = {}
            storesnodes = {}
            putsnodes = {}
            statements = irsb.statements #获取irsb的所有语句
            dfg = DiGraph()

            for stmt_idx, stmt in enumerate(statements):#遍历每条语句
                # We want to skip over certain types, such as Imarks
                if self._need_to_ignore(node.addr, stmt, stmt_idx):
                    continue

                # break statement down into sub-expressions
                exprs = stmt.expressions #获得语句的子表达式
                stmt_node = stmt
                dfg.add_node(stmt)

                if stmt.tag == 'Ist_WrTmp':
                    tmpsnodes[stmt.tmp] = stmt_node
                    if exprs[0].tag == 'Iex_Binop':
                        if exprs[1].tag == 'Iex_RdTmp':
                            dfg.add_edge(tmpsnodes[exprs[1].tmp], stmt_node)
                        else:
                            dfg.add_edge(exprs[1], stmt_node)
                        if exprs[2].tag == 'Iex_RdTmp':
                            dfg.add_edge(tmpsnodes[exprs[2].tmp], stmt_node)
                        else:
                            dfg.add_edge(exprs[2], stmt_node)

                    elif exprs[0].tag == 'Iex_Unop':
                        dfg.remove_node(stmt_node)
                        if exprs[1].tag == 'Iex_RdTmp':
                            tmpsnodes[stmt.tmp] = copy(tmpsnodes[exprs[1].tmp])
                            tmpsnodes[stmt.tmp].tmp = stmt.tmp
                        else:
                            tmpsnodes[stmt.tmp] = exprs[1]

                    elif exprs[0].tag == 'Iex_RdTmp':
                        tmpsnodes[stmt.tmp] = copy(tmpsnodes[exprs[0].tmp])
                        tmpsnodes[stmt.tmp].tmp = stmt.tmp

                    elif exprs[0].tag == 'Iex_Get':
                        if putsnodes.has_key(exprs[0].offset):
                            dfg.add_edge(putsnodes[exprs[0].offset], stmt_node)
                        if len(exprs) > 1 and exprs[1].tag == "Iex_RdTmp":
                            dfg.add_edge(tmpsnodes[exprs[1].tmp], stmt_node)
                        elif len(exprs) > 1:
                            dfg.add_edge(exprs[1], stmt_node)

                    elif exprs[0].tag == 'Iex_Load':
                        if exprs[1].tag == 'Iex_RdTmp':
                            dfg.add_edge(tmpsnodes[exprs[1].tmp], stmt_node)
                        else:
                            dfg.add_edge(exprs[1], stmt_node)

                    else:
                        # Take a guess by assuming exprs[0] is the op and any other expressions are args
                        for e in exprs[1:]:
                            if e.tag == 'Iex_RdTmp':
                                dfg.add_edge(tmpsnodes[e.tmp], stmt_node)
                            else:
                                dfg.add_edge(e, stmt_node)

                elif stmt.tag == 'Ist_Store':
                    if exprs[0].tag == 'Iex_RdTmp':
                        dfg.add_edge(tmpsnodes[exprs[0].tmp], stmt_node)

                    elif exprs[0].tag == 'Iex_Const':
                        dfg.add_edge(exprs[0], stmt_node)

                    if exprs[1].tag == 'Iex_RdTmp':
                        dfg.add_edge(tmpsnodes[exprs[1].tmp], stmt_node)
                    else:
                        dfg.add_edge(exprs[1], stmt_node)

                elif stmt.tag == 'Ist_Put':
                    if exprs[0].tag == 'Iex_RdTmp':
                        dfg.add_edge(tmpsnodes[exprs[0].tmp], stmt_node)
                    elif exprs[0].tag == 'Iex_Const':
                        dfg.add_edge(exprs[0], stmt_node)
                    putsnodes[stmt.offset] = stmt_node

                elif stmt.tag == 'Ist_Exit':
                    if exprs[0].tag == 'Iex_RdTmp':
                        dfg.add_edge(tmpsnodes[exprs[0].tmp], stmt_node)

                elif stmt.tag == 'Ist_Dirty':
                    tmpsnodes[stmt.tmp] = stmt_node
                elif stmt.tag == 'Ist_CAS':
                    tmpsnodes[stmt.oldLo] = stmt_node

                else:
                    for e in stmt.expressions:
                        if e.tag == 'Iex_RdTmp':
                            dfg.add_edge(tmpsnodes[e.tmp], stmt_node)
                        else:
                            dfg.add_edge(e, stmt_node)

            for vtx in list(dfg.nodes()):
                if dfg.degree(vtx) == 0:
                    dfg.remove_node(vtx)

            if dfg.size() > 0:
                dfgs[node.addr] = dfg
        return dfgs

根据不同statements的类型,标记不同的点。

其实不仅可以用cfg来恢复数据流图,任意的一个block都可以利用这个方法恢复数据流。

唯一的遗憾就是,恢复的数据流是block的,要想恢复函数间的数据流,就应该恢复数据依赖图。

下面是我写的恢复任意一个block的测试代码,当然调用仍然是construct函数。

def main():
    proj = angr.Project("test2.bin",load_options={'auto_load_libs':False})
    start_addr=0x1F065405
    start_state= proj.factory.blank_state(addr=start_addr)
    addrs=[start_addr]
    dfgs=constructDFG(addrs,proj)
    print len(dfgs)
    plot_common(dfgs[start_addr],"dfg_1F065405")

constructDFG只是对construct的一点更改

def constructDFG(addrs,project):
    dfgs={}
    for addr in addrs:
        irsb = project.factory.block(addr).vex
        if irsb is not None:
            tmpsnodes = {}
            storesnodes = {}
            putsnodes = {}
            statements = irsb.statements 
            dfg = DiGraph()

            for stmt_idx, stmt in enumerate(statements): 

                # break statement down into sub-expressions 
                exprs = stmt.expressions 
                stmt_node = stmt 
                dfg.add_node(stmt)

                if stmt.tag == 'Ist_WrTmp': 
                    tmpsnodes[stmt.tmp] = stmt_node 
                    if exprs[0].tag == 'Iex_Binop':
                        if exprs[1].tag == 'Iex_RdTmp':
                            dfg.add_edge(tmpsnodes[exprs[1].tmp], stmt_node)
                        else:
                            dfg.add_edge(exprs[1], stmt_node)
                        if exprs[2].tag == 'Iex_RdTmp':
                            dfg.add_edge(tmpsnodes[exprs[2].tmp], stmt_node)
                        else:
                            dfg.add_edge(exprs[2], stmt_node)

                    elif exprs[0].tag == 'Iex_Unop':
                        dfg.remove_node(stmt_node)
                        if exprs[1].tag == 'Iex_RdTmp':
                            tmpsnodes[stmt.tmp] = copy(tmpsnodes[exprs[1].tmp])
                            tmpsnodes[stmt.tmp].tmp = stmt.tmp
                        else:
                            tmpsnodes[stmt.tmp] = exprs[1]

                    elif exprs[0].tag == 'Iex_RdTmp':
                        tmpsnodes[stmt.tmp] = copy(tmpsnodes[exprs[0].tmp])
                        tmpsnodes[stmt.tmp].tmp = stmt.tmp

                    elif exprs[0].tag == 'Iex_Get':
                        if putsnodes.has_key(exprs[0].offset):
                            dfg.add_edge(putsnodes[exprs[0].offset], stmt_node)
                        if len(exprs) > 1 and exprs[1].tag == "Iex_RdTmp":
                            dfg.add_edge(tmpsnodes[exprs[1].tmp], stmt_node)
                        elif len(exprs) > 1:
                            dfg.add_edge(exprs[1], stmt_node)

                    elif exprs[0].tag == 'Iex_Load':
                        if exprs[1].tag == 'Iex_RdTmp':
                            dfg.add_edge(tmpsnodes[exprs[1].tmp], stmt_node)
                        else:
                            dfg.add_edge(exprs[1], stmt_node)

                    else:
                        # Take a guess by assuming exprs[0] is the op and any other expressions are args
                        for e in exprs[1:]:
                            if e.tag == 'Iex_RdTmp':
                                dfg.add_edge(tmpsnodes[e.tmp], stmt_node)
                            else:
                                dfg.add_edge(e, stmt_node)

                elif stmt.tag == 'Ist_Store':
                    if exprs[0].tag == 'Iex_RdTmp':
                        dfg.add_edge(tmpsnodes[exprs[0].tmp], stmt_node)

                    elif exprs[0].tag == 'Iex_Const':
                        dfg.add_edge(exprs[0], stmt_node)

                    if exprs[1].tag == 'Iex_RdTmp':
                        dfg.add_edge(tmpsnodes[exprs[1].tmp], stmt_node)
                    else:
                        dfg.add_edge(exprs[1], stmt_node)

                elif stmt.tag == 'Ist_Put':
                    if exprs[0].tag == 'Iex_RdTmp':
                        dfg.add_edge(tmpsnodes[exprs[0].tmp], stmt_node)
                    elif exprs[0].tag == 'Iex_Const':
                        dfg.add_edge(exprs[0], stmt_node)
                    putsnodes[stmt.offset] = stmt_node

                elif stmt.tag == 'Ist_Exit':
                    if exprs[0].tag == 'Iex_RdTmp':
                        dfg.add_edge(tmpsnodes[exprs[0].tmp], stmt_node)

                elif stmt.tag == 'Ist_Dirty':
                    tmpsnodes[stmt.tmp] = stmt_node
                elif stmt.tag == 'Ist_CAS':
                    tmpsnodes[stmt.oldLo] = stmt_node

                else:
                    for e in stmt.expressions:
                        if e.tag == 'Iex_RdTmp':
                            dfg.add_edge(tmpsnodes[e.tmp], stmt_node)
                        else:
                            dfg.add_edge(e, stmt_node)

            for vtx in list(dfg.nodes()):
                if dfg.degree(vtx) == 0:
                    dfg.remove_node(vtx)

            if dfg.size() > 0:
                dfgs[addr] = dfg
        return dfgs

最终画出的图为:

angr源码分析——DFG 数据流图_第1张图片

你可能感兴趣的:(漏洞挖掘)