[Python] 读取一个xlsx文件处理并保存成txt文件或binary

最近拿到一个上游的xlsx, 需要自己加工处理取到自己想要的值,需要注意的知识点做个记录:

以下内容基于以下假设:
1, 文件名字为Data_A.xlsxData_B.xlsx, 其内容格式为:

A B C D E F
0x101 10 1 1024
0x201 11 1 117
0x301 12 3 31
0x401 13 2 41
0x501 14 1 126
0x601 15 4 13

0 主要程序框架

def clean():
        current_path = os.getcwd()
        for infile in glob.glob( os.path.join(current_path, '*.csv') ):
            os.remove(infile)

if __name__ == '__main__':
    clean()
    sys.exit()

1 从用户输入的cmd line读取xlsx文件

def parse_input(argv): 
    inputfile = '' 
    try: 
        opts, args = getopt.getopt(sys.argv[1:],"hi:",["help","ifile="])
    except getopt.GetoptError:
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h","--help"):
            print('python3 parse_xlsx.py -i ')
            sys.exit()
        elif opt in ("-i", "--ifile"): 
            inputfile = arg
    return inputfile

2 读取xlsx文件为raw.csv文件

def parse_xlsx(file):
    current_path = os.getcwd()
    filepath = current_path+'\\new.csv'
    if (os.path.exists(filepath)) :
        os.remove(filepath)

    rawdf = pd.read_excel(file, sheet_name=0)
    DataFrame(rawdf).to_csv('raw.csv', index=False, header=0)

3 读取raw.csv文件

newdf = pd.read_csv('raw.csv', sep=',', header=0, names=None)

4 将D E列进行处理:以此列的值作为排序依据先进行排序

newdf = pd.read_csv('raw.csv', sep=',', header=0, names=None)

newdf_temp = newdf
if any(newdf_temp['D E'] == 2):
	newdf_order_1 = newdf_temp.loc[newdf_temp['D E'] == 1]
	newdf_order_2 = newdf_temp.loc[newdf_temp['D E'] == 2]
	newdf = newdf_order_1._append(newdf_order_2, ignore_index=True)
	newdf_order_3 = newdf_temp.loc[newdf_temp['D E'] == 3]
	newdf = newdf._append(newdf_order_3, ignore_index=True)
	newdf_order_4 = newdf_temp.loc[newdf_temp['D E'] == 4]
	newdf = newdf._append(newdf_order_4, ignore_index=True)

5 将A列进行处理:去掉0x, 并按8位对齐

newdf['a'] = newdf['A'].astype('str') # the value of A column will handled as string
newdf['a'] = newdf['a'].str.replace(r"0x", "")
newdf_a = newdf['a'].str.zfill(8)

6 将B列进行处理:转成十六进制数, 并按8位对齐

newdf['b'] = newdf['B'].astype('int')
newdf['b'] = newdf['b'].apply( hex )
newdf['b'] = newdf['b'].str.replace(r"0x", "")
newdf_b = newdf['b'].str.zfill(8)

7 将C列进行处理:全部为NaN, 忽略

8 将F列进行处理:转成十六进制数, 并按8位对齐

	newdf['f'] = newdf['F'].astype('int')
	newdf['f'] = newdf['f'].apply( hex )
	newdf['f'] = newdf['f'].str.replace(r"0x", "")
	newdf_f = newdf['f'].str.zfill(8)

9 把上面处理完的ABF列进行拼接,结果存进new.csv

此处要注意拼接的前后顺序,注意是列拼接

newdf = newdf_a + newdf_b + newdf_f
DataFrame(newdf).to_csv('new.csv', index=False, header=False)

10 区分AB两个文件

def distinguish(file):
	if "a".upper() in file.upper():
	    bfile = "A.bin"
	elif "b".upper() in file.upper():
	    bfile = "B.bin"
	else:
	    print("Please confirm input is a or b")

11 将传入的数据每8位作为一组,进行大小端转化

def byteorder_convert(para):
    little_hex = bytearray.fromhex(para)
    i =0
    tmp = bytearray()
    result = bytearray()
    while i < len(little_hex):
        tmp = little_hex[i:i+4]
        tmp.reverse()
        result = result + tmp
        # print(result)
        i += 4
    str_little = "".join(format(x, "02x") for x in result)
    return str_little

12.a 对每一行进行处理,结果存进bfile

def binary(bfile):
    with open('new.csv', 'r') as f:
        reader = csv.reader(f)
        # print(type(reader))
        file = open(bfile, "wb")
        for row in reader:
            row = byteorder_convert(''.join(row))
            # print(row)
            result = ''
            t = '\\x'
            i = 0
            if len(row)%2 == 1:
                print('String length illegal')
            while i < len(row):
                result = result + t + row[i:i+2]
                i = i+2
                s = result
                b = eval('b' + '\'' + s + '\'')
            # print(result)
            # print(b)
            file.write(bytes(b))
        file.close()
        print("Done! Please check", bfile)

12.b 对每一行进行处理,结果存进bfile

def array(file, newdf_a, newdf_b, newdf_f):
    if "a".upper() in file.upper():
        afile = "new_a.c"
    elif "b".upper() in file.upper():
        afile = "new_b.c"
    else:
        print("Please confirm input is a or b")

	newdf = "{"  + ".a = "+ newdf_a + ", .b = " + newdf_b + ", .f = " + newdf_f + "}"
	DataFrame(newdf).to_csv('new.csv', index=False, header=False)

    newdf = pd.read_csv('new.csv', sep=',', header=0, names=None)
    # print(newdf)
    DataFrame(newdf).to_csv(afile, sep=' ', index=False, header=False, quoting=csv.QUOTE_NONE, escapechar=' ')
    print("Done! Please check", afile)

13 删除过程中的csv中间文件

def clean():
        current_path = os.getcwd()
        for infile in glob.glob( os.path.join(current_path, '*.csv') ):
            os.remove(infile)

你可能感兴趣的:(Python,python,开发语言,数据挖掘)