这篇文章主要讲述angr中数据流图(Data Flow Graph,DFG)的构建。
DFG恢复的是CFG中每个基本块的数据流!
DFG为CFG的每个基本块构建一个数据流图(DFG)
构建好的DFG可以通过字典self.dfgs获得,其中key为基本块的地址,value为该基本块对应的DFG。
param cfg:用于获得所有基本块的CFG
param annocfg:一个由后向切片(backward slice)构建的AnnotatedCFG,用于只对白名单中的语句构建DFG。
构造函数:
def __init__(self, cfg=None, annocfg=None):
    """
    Build a Data Flow Graph (DFG) for every basic block of a CFG.

    The resulting graphs are stored in the dict ``self.dfgs``, keyed by
    basic block address, with the DFG of that block as the value.

    :param cfg: A CFG used to get all the basic blocks. When omitted, one is
                generated with ``self.project.analyses.CFGAccurate()``.
    :param annocfg: An AnnotatedCFG built from a backward slice, used to
                    only build the DFG on the whitelisted statements.
    """
    # Fall back to a freshly recovered CFG only when the caller gave none.
    self._cfg = cfg if cfg is not None else self.project.analyses.CFGAccurate()
    self._annocfg = annocfg

    # The graphs are computed eagerly, once both inputs are in place.
    self.dfgs = self._construct()
如果没有传入cfg,就先调用CFGAccurate()构建一个cfg。
然后,调用_construct()函数构建DFG。这个函数,有点长,不过也是构造数据流的主要函数。下面开始分析吧。
def _construct(self):
    """
    Build one data-flow graph per basic block of ``self._cfg``.

    We want to build the type of DFG that's used in "Automated Ident. of Crypto
    Primitives in Binary Code with Data Flow Graph Isomorphisms." Unlike that
    paper, however, we're building it on Vex IR instead of assembly instructions.

    Every CFG node that is not a SimProcedure is lifted to VEX. Each statement
    that ``self._need_to_ignore`` does not reject becomes a graph node, and
    edges run from the producer of an operand to the statement consuming it.
    The producer is one of: the node recorded for a temporary, the last
    ``Put`` seen for an offset, or the operand expression itself.

    :return: A dict mapping a basic block address to its ``DiGraph``. Nodes
             with degree 0 are pruned, and a block is only recorded when
             ``dfg.size() > 0``.
    """
    cfg = self._cfg
    p = self.project
    dfgs = {}

    l.debug("Building Vex DFG...")

    for node in cfg.nodes():  # walk every CFG node
        try:
            if node.simprocedure_name is None:
                # lift the block at this node's address to a VEX IRSB
                irsb = p.factory.block(node.addr).vex
            else:
                l.debug("Cannot process SimProcedures, ignoring %s", node.simprocedure_name)
                continue
        except Exception as e:
            # Best effort: a node that cannot be lifted is logged and skipped.
            l.debug(e)
            continue

        tmpsnodes = {}  # temporary (stmt.tmp / expr.tmp) -> node producing it
        putsnodes = {}  # Put offset -> most recent Put statement
        statements = irsb.statements  # all statements of the IRSB
        dfg = DiGraph()

        for stmt_idx, stmt in enumerate(statements):  # walk every statement
            # We want to skip over certain types, such as Imarks
            if self._need_to_ignore(node.addr, stmt, stmt_idx):
                continue

            # break statement down into sub-expressions
            exprs = stmt.expressions
            stmt_node = stmt
            dfg.add_node(stmt)

            if stmt.tag == 'Ist_WrTmp':
                tmpsnodes[stmt.tmp] = stmt_node
                if exprs[0].tag == 'Iex_Binop':
                    if exprs[1].tag == 'Iex_RdTmp':
                        dfg.add_edge(tmpsnodes[exprs[1].tmp], stmt_node)
                    else:
                        dfg.add_edge(exprs[1], stmt_node)
                    if exprs[2].tag == 'Iex_RdTmp':
                        dfg.add_edge(tmpsnodes[exprs[2].tmp], stmt_node)
                    else:
                        dfg.add_edge(exprs[2], stmt_node)

                elif exprs[0].tag == 'Iex_Unop':
                    # Unary ops get no node of their own: the written temp is
                    # aliased to (a copy of) whatever produced the operand.
                    dfg.remove_node(stmt_node)
                    if exprs[1].tag == 'Iex_RdTmp':
                        tmpsnodes[stmt.tmp] = copy(tmpsnodes[exprs[1].tmp])
                        tmpsnodes[stmt.tmp].tmp = stmt.tmp
                    else:
                        tmpsnodes[stmt.tmp] = exprs[1]

                elif exprs[0].tag == 'Iex_RdTmp':
                    # Plain temp-to-temp move: alias to a copy of the producer.
                    tmpsnodes[stmt.tmp] = copy(tmpsnodes[exprs[0].tmp])
                    tmpsnodes[stmt.tmp].tmp = stmt.tmp

                elif exprs[0].tag == 'Iex_Get':
                    # `in` instead of dict.has_key(), which Python 3 removed.
                    if exprs[0].offset in putsnodes:
                        dfg.add_edge(putsnodes[exprs[0].offset], stmt_node)
                    if len(exprs) > 1 and exprs[1].tag == "Iex_RdTmp":
                        dfg.add_edge(tmpsnodes[exprs[1].tmp], stmt_node)
                    elif len(exprs) > 1:
                        dfg.add_edge(exprs[1], stmt_node)

                elif exprs[0].tag == 'Iex_Load':
                    if exprs[1].tag == 'Iex_RdTmp':
                        dfg.add_edge(tmpsnodes[exprs[1].tmp], stmt_node)
                    else:
                        dfg.add_edge(exprs[1], stmt_node)

                else:
                    # Take a guess by assuming exprs[0] is the op and any other expressions are args
                    for e in exprs[1:]:
                        if e.tag == 'Iex_RdTmp':
                            dfg.add_edge(tmpsnodes[e.tmp], stmt_node)
                        else:
                            dfg.add_edge(e, stmt_node)

            elif stmt.tag == 'Ist_Store':
                if exprs[0].tag == 'Iex_RdTmp':
                    dfg.add_edge(tmpsnodes[exprs[0].tmp], stmt_node)
                elif exprs[0].tag == 'Iex_Const':
                    dfg.add_edge(exprs[0], stmt_node)

                if exprs[1].tag == 'Iex_RdTmp':
                    dfg.add_edge(tmpsnodes[exprs[1].tmp], stmt_node)
                else:
                    dfg.add_edge(exprs[1], stmt_node)

            elif stmt.tag == 'Ist_Put':
                if exprs[0].tag == 'Iex_RdTmp':
                    dfg.add_edge(tmpsnodes[exprs[0].tmp], stmt_node)
                elif exprs[0].tag == 'Iex_Const':
                    dfg.add_edge(exprs[0], stmt_node)
                # Remember this Put so a later Get of the same offset links to it.
                putsnodes[stmt.offset] = stmt_node

            elif stmt.tag == 'Ist_Exit':
                if exprs[0].tag == 'Iex_RdTmp':
                    dfg.add_edge(tmpsnodes[exprs[0].tmp], stmt_node)

            elif stmt.tag == 'Ist_Dirty':
                tmpsnodes[stmt.tmp] = stmt_node

            elif stmt.tag == 'Ist_CAS':
                tmpsnodes[stmt.oldLo] = stmt_node

            else:
                for e in stmt.expressions:
                    if e.tag == 'Iex_RdTmp':
                        dfg.add_edge(tmpsnodes[e.tmp], stmt_node)
                    else:
                        dfg.add_edge(e, stmt_node)

        # Drop nodes that ended up with no edge; iterate over a snapshot
        # because the graph is mutated inside the loop.
        for vtx in list(dfg.nodes()):
            if dfg.degree(vtx) == 0:
                dfg.remove_node(vtx)

        if dfg.size() > 0:
            dfgs[node.addr] = dfg

    return dfgs
根据statement类型的不同,为数据流图添加相应的节点和边。
其实不仅可以用cfg来恢复数据流图,任意的一个block都可以利用这个方法恢复数据流。
唯一的遗憾就是,恢复的数据流是block的,要想恢复函数间的数据流,就应该恢复数据依赖图。
下面是我写的恢复任意一个block数据流的测试代码,当然核心逻辑仍然沿用_construct函数。
def main():
    """
    Demo driver: recover and plot the DFG of a single basic block.

    Loads ``test2.bin`` without its shared libraries, builds the DFG of the
    block at ``0x1F065405`` through ``constructDFG``, prints how many graphs
    were produced, and passes the graph to ``plot_common`` under the name
    ``dfg_1F065405``.
    """
    proj = angr.Project("test2.bin", load_options={'auto_load_libs': False})
    start_addr = 0x1F065405

    dfgs = constructDFG([start_addr], proj)
    # Parenthesised form works as a statement on Python 2 and a call on Python 3.
    print(len(dfgs))

    # constructDFG omits blocks whose graph has no edges, so the address may
    # be missing; report that instead of failing with a KeyError.
    dfg = dfgs.get(start_addr)
    if dfg is None:
        print("no data-flow edges recovered for block 0x%x" % start_addr)
        return
    plot_common(dfg, "dfg_1F065405")
constructDFG只是在_construct的基础上做了一点更改:
def _link_operand(dfg, tmpsnodes, expr, stmt_node):
    """Add an edge to *stmt_node* from the producer of operand *expr*."""
    if expr.tag == 'Iex_RdTmp':
        # A temp read is linked to the node recorded for that temp.
        dfg.add_edge(tmpsnodes[expr.tmp], stmt_node)
    else:
        # Any other operand is used as a source node itself.
        dfg.add_edge(expr, stmt_node)


def _build_block_dfg(statements):
    """Return the ``DiGraph`` of data dependencies between the VEX *statements* of one block."""
    tmpsnodes = {}  # temporary (stmt.tmp / expr.tmp) -> node producing it
    putsnodes = {}  # Put offset -> most recent Put statement
    dfg = DiGraph()

    for stmt in statements:
        # break statement down into sub-expressions
        exprs = stmt.expressions
        stmt_node = stmt
        dfg.add_node(stmt)

        if stmt.tag == 'Ist_WrTmp':
            tmpsnodes[stmt.tmp] = stmt_node
            if exprs[0].tag == 'Iex_Binop':
                _link_operand(dfg, tmpsnodes, exprs[1], stmt_node)
                _link_operand(dfg, tmpsnodes, exprs[2], stmt_node)
            elif exprs[0].tag == 'Iex_Unop':
                # Unary ops get no node of their own: the written temp is
                # aliased to (a copy of) whatever produced the operand.
                dfg.remove_node(stmt_node)
                if exprs[1].tag == 'Iex_RdTmp':
                    tmpsnodes[stmt.tmp] = copy(tmpsnodes[exprs[1].tmp])
                    tmpsnodes[stmt.tmp].tmp = stmt.tmp
                else:
                    tmpsnodes[stmt.tmp] = exprs[1]
            elif exprs[0].tag == 'Iex_RdTmp':
                # Plain temp-to-temp move: alias to a copy of the producer.
                tmpsnodes[stmt.tmp] = copy(tmpsnodes[exprs[0].tmp])
                tmpsnodes[stmt.tmp].tmp = stmt.tmp
            elif exprs[0].tag == 'Iex_Get':
                # `in` instead of dict.has_key(), which Python 3 removed.
                if exprs[0].offset in putsnodes:
                    dfg.add_edge(putsnodes[exprs[0].offset], stmt_node)
                if len(exprs) > 1:
                    _link_operand(dfg, tmpsnodes, exprs[1], stmt_node)
            elif exprs[0].tag == 'Iex_Load':
                _link_operand(dfg, tmpsnodes, exprs[1], stmt_node)
            else:
                # Take a guess by assuming exprs[0] is the op and any other expressions are args
                for e in exprs[1:]:
                    _link_operand(dfg, tmpsnodes, e, stmt_node)

        elif stmt.tag == 'Ist_Store':
            if exprs[0].tag == 'Iex_RdTmp':
                dfg.add_edge(tmpsnodes[exprs[0].tmp], stmt_node)
            elif exprs[0].tag == 'Iex_Const':
                dfg.add_edge(exprs[0], stmt_node)
            _link_operand(dfg, tmpsnodes, exprs[1], stmt_node)

        elif stmt.tag == 'Ist_Put':
            if exprs[0].tag == 'Iex_RdTmp':
                dfg.add_edge(tmpsnodes[exprs[0].tmp], stmt_node)
            elif exprs[0].tag == 'Iex_Const':
                dfg.add_edge(exprs[0], stmt_node)
            # Remember this Put so a later Get of the same offset links to it.
            putsnodes[stmt.offset] = stmt_node

        elif stmt.tag == 'Ist_Exit':
            if exprs[0].tag == 'Iex_RdTmp':
                dfg.add_edge(tmpsnodes[exprs[0].tmp], stmt_node)

        elif stmt.tag == 'Ist_Dirty':
            tmpsnodes[stmt.tmp] = stmt_node

        elif stmt.tag == 'Ist_CAS':
            tmpsnodes[stmt.oldLo] = stmt_node

        else:
            for e in stmt.expressions:
                _link_operand(dfg, tmpsnodes, e, stmt_node)

    # Drop nodes that ended up with no edge; iterate over a snapshot because
    # the graph is mutated inside the loop.
    for vtx in list(dfg.nodes()):
        if dfg.degree(vtx) == 0:
            dfg.remove_node(vtx)

    return dfg


def constructDFG(addrs, project):
    """
    Build a data-flow graph for each basic block address in *addrs*.

    Unlike the CFG-driven analysis, this needs no CFG: each address is lifted
    directly with ``project.factory.block(addr).vex``, and no statement is
    filtered out before processing.

    :param addrs: Iterable of basic block start addresses.
    :param project: Project object providing ``factory.block``.
    :return: A dict mapping each address to its ``DiGraph``. An address is
             only present when its graph has ``size() > 0``.
    """
    dfgs = {}
    for addr in addrs:
        irsb = project.factory.block(addr).vex
        if irsb is None:
            continue

        dfg = _build_block_dfg(irsb.statements)
        if dfg.size() > 0:
            dfgs[addr] = dfg
    return dfgs
最终画出的图为: