python-二进制读写存取

python的pickle模块可以很方便的进行序列化的操作,将一个Python对象序列化为一个字节流,以便将它保存到一个文件、存储到数据库或者通过网络传输它。

但是当我们直接使用pickle的时候,保存的字节流无法被例如C等其他语言读取。所以Python提供了一个struct模块,用来处理Python数据和C/C#等数据之间的转换。

struct模块的内容不多,也不是太难,下面对其中最常用的方法进行介绍:

struct.pack

struct.pack用于将Python的值根据格式符转换为字节串(Python 2中返回str类型,Python 3中返回bytes类型;可以把它理解为字节流,或字节数组)。
其函数原型为:struct.pack(fmt, v1, v2, …),参数fmt是格式字符串,关于格式字符串的相关信息在下面有所介绍。v1, v2, …表示要转换的python值。下面的例子将两个整数转换为字符串(字节流):

import struct

a = 20
b = 400

# pack() returns the raw byte stream (a bytes object in Python 3) that other
# languages can read back or that can be sent over a network.
# The result is not named `str`, so the builtin str() stays usable.
packed = struct.pack("ii", a, b)
print('length:', len(packed))
print(packed)

格式符"i"表示转换为int,"ii"表示有两个int变量。

struct.unpack

struct.unpack做的工作刚好与struct.pack相反,用于将字节流转换成python数据类型。它的函数原型为:struct.unpack(fmt, string),该函数返回一个元组。 下面是一个简单的例子:

# Round-trip two ints: unpack() always returns a tuple, even for one value.
packed = struct.pack("ii", 20, 400)
a1, a2 = struct.unpack("ii", packed)
print('a1:', a1)
print('a2:', a2)

# ---- result:
# a1: 20
# a2: 400

struct.calcsize

struct.calcsize用于计算格式字符串所对应的结果的长度,如:struct.calcsize('ii')返回8,因为两个int类型一共占用8个字节。

关于格式字符串

在Python手册中,给出了C语言中常用类型与Python类型对应的格式符:

Format  C Type              Python type         Standard size
x       pad byte            no value
c       char                string of length 1  1
b       signed char         integer             1
B       unsigned char       integer             1
?       _Bool               bool                1
h       short               integer             2
H       unsigned short      integer             2
i       int                 integer             4
I       unsigned int        integer             4
l       long                integer             4
L       unsigned long       integer             4
q       long long           integer             8
Q       unsigned long long  integer             8
f       float               float               4
d       double              float               8
s       char[]              string
p       char[]              string
P       void *              integer
格式字符串前加 > 表示大端字节序(big-endian), < 表示小端字节序(little-endian);这两种模式都使用标准大小,并且不做对齐填充。

下面是存取二进制文件的代码

# coding: utf-8
import os
import codecs
import nltk
import struct
import random
import string

def randomString():
    """Return a random 16-character string of ASCII letters and digits.

    Characters are drawn with random.sample, i.e. without replacement, so
    no character occurs twice in the result.
    """
    alphabet = string.ascii_letters + string.digits
    picked = random.sample(alphabet, 16)
    return ''.join(picked)

#print(randomString())

# Convert the text training files under train_data/ into one binary file per
# index i.
#
# Layout written to train_data/text_<i> (format strings carry no prefix, so
# struct uses native byte order, sizes and alignment):
#   int                    last integer found in sequence_num_<i>
#   then, for every integer N on each line of words_num_<i>:
#     char[256], int, int  NUL-padded record name, N, N
#     int   * N            integers from the matching line of target_<i>
#     float * N            numbers from the matching line of feature_<i>
num = 1
for i in range(num):
    suffix = str(i)

    # Every token must parse as an int; the last one is the sequence count.
    with open("train_data/sequence_num_" + suffix, "r") as file_sequence:
        sequence_values = [int(token) for token in file_sequence.read().split()]
    if not sequence_values:
        raise ValueError("train_data/sequence_num_" + suffix + " contains no integer")
    num_sequence = sequence_values[-1]
    print(num_sequence)

    num1 = 0  # number of input lines converted so far
    # All files are managed by `with`, so they are closed even when a
    # conversion or consistency check below fails.
    with open("train_data/words_num_" + suffix, "r") as file_words, \
            open("train_data/feature_" + suffix, "r") as file_feature, \
            open("train_data/target_" + suffix, "r") as file_target, \
            open("train_data/text_" + suffix, "wb") as out_text:
        out_text.write(struct.pack("i", num_sequence))

        # zip() stops at the shortest of the three input files.
        for line_words, line_feature, line_target in zip(file_words, file_feature, file_target):
            targets = [int(token) for token in line_target.split()]
            # Each feature is round-tripped through 16-decimal-place text
            # formatting before being packed as a C float (kept as-is from
            # the original conversion).
            features = [float('%.16f' % float(token)) for token in line_feature.split()]

            for word in line_words.split():
                num_word = int(word)
                # Record name: 16 random characters + "_<file index>_<1-based line number>".
                record_name = randomString() + "_" + suffix + "_" + str(num1 + 1)
                out_text.write(struct.pack("256sii", bytes(record_name, 'utf-8'), num_word, num_word))

                # Packing the whole list at once yields the same bytes as
                # packing element by element with "i" / "f".
                out_text.write(struct.pack("%di" % len(targets), *targets))
                assert len(targets) == num_word, "target count does not match word count"

                out_text.write(struct.pack("%df" % len(features), *features))
                assert len(features) == num_word, "feature count does not match word count"

            num1 += 1

    print(num1, num_sequence)
    assert num1 == num_sequence, "converted line count does not match sequence count"
import sys
import struct

# Print the leading records of a binary file laid out as:
#   int                    sequence count
#   then per record:
#     char[256], int, int  NUL-padded name and two counts
#     int   * count        target values
#     float * count        feature values
# Format strings carry no prefix, so struct uses native byte order and sizes.
if len(sys.argv) < 2:
    sys.exit("usage: python %s <binary_file>" % sys.argv[0])

file_name = sys.argv[1]

MAX_RECORDS = 5  # only the first few records are printed
HEADER_FMT = "256sii"
HEADER_SIZE = struct.calcsize(HEADER_FMT)
INT_SIZE = struct.calcsize("i")
FLOAT_SIZE = struct.calcsize("f")

with open(file_name, "rb") as fp:
    file_content = fp.read()

# unpack_from() returns a tuple, so this prints as "(n,)".
number_seq = struct.unpack_from("i", file_content, 0)
offset = INT_SIZE
print(number_seq)

for i in range(MAX_RECORDS):
    # Stop cleanly when the file holds fewer than MAX_RECORDS records
    # instead of failing on a short read.
    if offset + HEADER_SIZE > len(file_content):
        break

    name = struct.unpack_from(HEADER_FMT, file_content, offset)
    print(name)
    offset += HEADER_SIZE

    tgt_num = name[2]
    print(name[1], name[2])

    tgt_list = list(struct.unpack_from("%di" % tgt_num, file_content, offset))
    print(tgt_list)
    offset += INT_SIZE * tgt_num

    feq_list = list(struct.unpack_from("%df" % tgt_num, file_content, offset))
    offset += FLOAT_SIZE * tgt_num
    print(feq_list)

你可能感兴趣的:(⭐️python基础知识,python,二进制)