python-表格数据统计

分享一个统计excel的关系统计脚本

1.xlwt操作,合并单元格

原表格:,去计算word1,word2,word3之之间的关系

python-表格数据统计_第1张图片

脚本处理后,可获得每个词语相关的词语的次数

python-表格数据统计_第2张图片

源码:

import xlrd,re,os,xlwt
import operator



def exportExcel(path,field_attr=None):
    #词组
    word_group = set()
    #1.检查表格
    flag,msg = True,'ok'
    if not flag:
        return msg
    else:#通过验证
        # 2.读取excel
        data = xlrd.open_workbook(path)
        now_table = data.sheet_by_index(0)
        # 获得当前表格的行数
        rows_numn = now_table.nrows
        # 将当前的sheet插入到数据库
        for k in range(1, rows_numn):
            row_vlaue = now_table.row_values(k)
            #obj = {}
            # 处理要插入的数据,把非字符串的数据转换成字符串类型,同事将字符串变成 sql语句需要的类型

            for a in range(0, len(row_vlaue)):
                ctype = now_table.cell(k, a).ctype
                # ctype: 0 empty,1 string, 2 number, 3 date, 4 boolean, 5 error
                if ctype == 0 or row_vlaue[a] == '':
                    pass
                else:
                    #print(k,row_vlaue[a])
                    word_group.add(row_vlaue[a].lower())

                #obj[field_attr[a]] = row_vlaue[a]
         #大字典
        big_dict = {}
        for i in word_group:
            big_dict[i] = {}
        for k in range(1, rows_numn):
            row_vlaue = now_table.row_values(k)
            #obj = {}
            # 处理要插入的数据,把非字符串的数据转换成字符串类型,同事将字符串变成 sql语句需要的类型
            row_arr = []
            for a in range(0, len(row_vlaue)):
                ctype = now_table.cell(k, a).ctype
                # ctype: 0 empty,1 string, 2 number, 3 date, 4 boolean, 5 error
                if ctype == 0 or row_vlaue[a] == '':
                    pass
                else:
                    row_arr.append(row_vlaue[a].lower())
            if len(row_arr) == 2:
                a = row_arr[0]
                b = row_arr[1]
                a_word_dict = big_dict[a]
                b_word_dict = big_dict[b]
                if b in a_word_dict.keys():
                    a_word_dict[b] += 1
                else:
                    a_word_dict[b] = 1

                if a in b_word_dict.keys():
                    b_word_dict[a] += 1
                else:
                    b_word_dict[a] = 1
                print(a,b)
            elif len(row_arr) == 3:
                a = row_arr[0]
                b = row_arr[1]
                c = row_arr[2]
                print(a,b,c)
                a_word_dict = big_dict[a]
                b_word_dict = big_dict[b]
                c_word_dict = big_dict[c]
                if b in a_word_dict.keys():
                    a_word_dict[b] += 1
                else:
                    a_word_dict[b] = 1
                if c in a_word_dict.keys():
                    a_word_dict[c] += 1
                else:
                    a_word_dict[c] =1

                if a in b_word_dict.keys():
                    b_word_dict[a] += 1
                else:
                    b_word_dict[a] = 1
                if c in b_word_dict.keys():
                    b_word_dict[c] += 1
                else:
                    b_word_dict[c] = 1

                if a in c_word_dict.keys():
                    c_word_dict[a] += 1
                else:
                    c_word_dict[a] = 1
                if b in c_word_dict.keys():
                    c_word_dict[b] += 1
                else:
                    c_word_dict[b] = 1
            else:pass
        return big_dict



def do_main(path,table_name):
    big_dict = exportExcel(path)
    big_arr = []
    for k, v in big_dict.items():
        v = sorted(v.items(), key=operator.itemgetter(1), reverse=True);
        total = 0
        for i in v:
            total += i[1]
        v = v[:5] if len(v) > 5 else v
        print('total:', total, k, ':', v)
        big_arr.append({'word': k, 'detail': v, 'total': total})

    big_arr = sorted(big_arr, key=lambda obj: obj['total'])

    obj_list = big_arr
    # 1.创建表格
    workbook = xlwt.Workbook()
    # 创建excel的一个sheet
    sheet = workbook.add_sheet(table_name, cell_overwrite_ok=True)

    heads = ['词语','g关系总数','词语关联(前五)']

    sheet.write(0,0,heads[0])
    sheet.write(0,1,heads[1])
    sheet.write_merge(0,0,3,7,heads[2])

    # 4.对象导入表格
    for row in range(len(obj_list)):
        obj = obj_list[len(obj_list) - row - 1]
        print(obj['word'], obj['total'], obj['detail'])
        sheet.write(row + 1, 0, obj['word'])
        sheet.write(row + 1, 1, obj['total'])
        for col in range(0, len(obj['detail'])):
            word = obj['detail'][col]
            sheet.write(row + 1, col + 3, "{0} ({1})".format(word[0], word[1]))

    save_path = table_name+'.xls'
    print('文件路径为' + save_path)
    try:
        workbook.save(save_path)
    except:
        print(save_path + '目录不存在')

    # 6.检查是否导出成功
    if os.path.isfile(save_path):
        print('导出成功,文件为' + save_path)
    else:
        msg = '存储文件发生异常,检查{0}目录是否存在'.format(save_path)
        print(msg)



if __name__ == '__main__':
    path = 'C:\\Users\\SHEIN\Desktop\\relation\\UK.csv'
    table_name = 'UK-result'

    do_main(path, table_name)


 

你可能感兴趣的:(scrapy)