因为工作需要,要将文件以二进制方式读取。网上资料比较少,所以我在此把自己的实践过程mark一下。
并且由于代码可能需要在不同的机器上运行,所以还需要考虑一下大小端转换问题。
先看下面的代码,然后我再简短地解释一下。
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
/* Number of matrices handled by the demo; also the length of the header table. */
#define nmemb 7

/*
 * Fixed-width type aliases (compiler specific): the stated widths hold
 * only where short is 16 bits and int is 32 bits.
 */
typedef unsigned char  uint8;   /* unsigned, 8 bits  */
typedef signed char    int8;    /* signed, 8 bits    */
typedef unsigned short uint16;  /* unsigned, 16 bits */
typedef signed short   int16;   /* signed, 16 bits   */
typedef unsigned int   uint32;  /* unsigned, 32 bits */
typedef signed int     int32;   /* signed, 32 bits   */
typedef float          fp32;    /* single-precision floating point */
typedef double         fp64;    /* double-precision floating point */

/*
 * Reverses the byte order of a 32-bit value. The swap is symmetric, so the
 * same macro converts big-endian to little-endian and back.
 * The argument is expanded several times; do not pass an expression with
 * side effects.
 */
#define BigtoLittle32(A) (((((uint32)(A)) >> 24) & 0x000000ffu) | \
                          ((((uint32)(A)) >> 8)  & 0x0000ff00u) | \
                          ((((uint32)(A)) << 8)  & 0x00ff0000u) | \
                          ((((uint32)(A)) << 24) & 0xff000000u))

/*
 * Reverses the two bytes of a 16-bit value (symmetric, like the 32-bit
 * variant). The argument is expanded twice.
 */
#define BigtoLittle16(A) (((((uint16)(A)) >> 8) & 0x00ff) | \
                          ((((uint16)(A)) << 8) & 0xff00))
/*
 * Returns a float whose four bytes are those of inFloat in reverse order,
 * i.e. the value converted between little-endian and big-endian layout.
 * Only the first four bytes are handled, matching a 4-byte float.
 */
float ReverseFloat(const float inFloat)
{
    float swapped;
    const char *src = (const char *)&inFloat;
    char *dst = (char *)&swapped;

    /* byte i of the result is byte (3 - i) of the input */
    for (int i = 0; i < 4; ++i) {
        dst[i] = src[3 - i];
    }
    return swapped;
}
/* Dimensions of one matrix, stored as a fixed-size header record in the binary file. */
struct matrix
{
int row; /* number of rows */
int column; /* number of columns */
}s[nmemb]; /* global header table: one entry per matrix, nmemb entries in total */
void set_s(int j, int x, int y)
{
s[j].row = x;
s[j].column = y;
}
/*
 * Reports the byte order of the host by looking at the byte stored at the
 * lowest address of an int holding 0x1234.
 * Returns false when that byte is 0x34 (least significant byte first,
 * i.e. little-endian) and true otherwise.
 */
bool is_bigendian()
{
    const int probe = 0x1234;
    const char lowest = *(const char *)&probe;

    return lowest != 0x34;
}
/*
 * Demo writer. Prints the host byte order, fills the global header table s
 * and writes the file "test" in binary mode:
 *   1. nmemb struct matrix headers (row, column);
 *   2. for every header, row*column floats in row-major order, where each
 *      element holds its own column index.
 * Everything is written in host byte order; no endian conversion is applied.
 *
 * Returns 0 on success and 1 if the file cannot be opened, memory cannot be
 * allocated, or a write fails.
 */
int main(void)
{
    if (is_bigendian()) {
        printf("BigEndian\n");
    } else {
        printf("LittleEndian\n");
    }

    set_s(0, 1, 50);
    set_s(1, 1, 80);
    set_s(2, 4, 20);
    set_s(3, 50, 1);
    set_s(4, 80, 2);
    set_s(5, 100, 3);
    set_s(6, 100, 4);

    /* sizeof yields size_t, so it must be printed with %zu, not %d */
    printf("size: %zu\n", sizeof(struct matrix));
    printf("size: %zu\n", sizeof(s));

    FILE *fp = fopen("test", "wb");
    if (fp == NULL) {
        printf("EROOR\n");
        return 1;
    }

    int status = 1; /* set to 0 only after every write succeeded */

    for (int j = 0; j < nmemb; ++j) {
        printf("row: %d column: %d\n", s[j].row, s[j].column);
    }

    /* header: all matrix dimensions in one contiguous write */
    if (fwrite(s, sizeof(struct matrix), nmemb, fp) != (size_t)nmemb) {
        fprintf(stderr, "failed to write matrix headers\n");
        goto cleanup;
    }

    /* payload: one row-major float array per header entry */
    for (int i = 0; i < nmemb; ++i) {
        const size_t rows = (size_t)s[i].row;
        const size_t cols = (size_t)s[i].column;
        const size_t count = rows * cols;

        /* calloc zero-fills the buffer, replacing the malloc + bzero pair */
        float *m = calloc(count, sizeof *m);
        if (m == NULL) {
            fprintf(stderr, "out of memory for matrix %d\n", i);
            goto cleanup;
        }
        for (size_t j = 0; j < rows; ++j) {
            for (size_t k = 0; k < cols; ++k) {
                m[k + j * cols] = (float)k;
            }
        }
        const size_t written = fwrite(m, sizeof *m, count, fp);
        free(m);
        if (written != count) {
            fprintf(stderr, "failed to write matrix %d\n", i);
            goto cleanup;
        }
    }
    status = 0;

cleanup:
    /* fclose flushes buffered data, so its result matters for a written file */
    if (fclose(fp) != 0) {
        fprintf(stderr, "failed to close output file\n");
        status = 1;
    }
    if (status == 0) {
        printf("11\n");
    }

    /*
     Reader counterpart, kept for reference: use it in place of the writer
     code above (from the fopen call on) to read "test" back and print it.

    printf("%zu\n", sizeof(float));
    FILE *fp;
    if ((fp = fopen("test", "rb")) == NULL) {
        printf("EROOR\n");
        return 1;
    }
    fread(s, sizeof(struct matrix), nmemb, fp);
    for (int i = 0; i < nmemb; ++i) {
        printf("row: %d column: %d\n", s[i].row, s[i].column);
    }
    for (int i = 0; i < nmemb; ++i) {
        float *m = (float*) malloc(sizeof(float) * s[i].row * s[i].column);
        bzero(m, sizeof(float) * s[i].row * s[i].column);
        fread(m, sizeof(float), s[i].row * s[i].column, fp);
        for (int j = 0; j < s[i].row; ++j) {
            for (int k = 0; k < s[i].column; ++k) {
                printf("%lf ", m[k + j*s[i].column]);
            }
            printf("\n");
        }
        printf("\n\n");
        free(m);
    }
    fclose(fp);
    */
    return status;
}
fopen和fclose是很常见的,在这里就不做解释了。我们来看看fwrite和fread,本来以为这个很麻烦,但是用过之后发现这个二进制文件读写才是最简单的。
size_t fwrite(const void * ptr,size_t size,size_t nmemb,FILE * stream);
fwrite()用来将数据写入文件流中。
stream为已打开的文件指针
ptr 指向欲写入的数据地址
写入的字节数由参数size*nmemb决定。
size表示单个数据项的字节数,nmemb表示要写入的数据项个数。
fwrite()会返回实际写入的nmemb数目。
size_t fread(void * ptr,size_t size,size_t nmemb,FILE * stream);
fread()用来从文件流中读取数据。
stream为已打开的文件指针
ptr 指向欲存放读取进来的数据空间
读取的字节数由参数size*nmemb决定。
size表示单个数据项的字节数,nmemb表示要读取的数据项个数。
fread()会返回实际读取到的nmemb数目,如果此值比参数nmemb 小,则代表可能读到了文件尾或有错误发生,这时必须用feof()或ferror()来决定发生什么情况。
返回实际读取到的nmemb数目。
详情参见上面的代码。
另外就是大小端的问题了。关于大小端的具体解释网上有很多,在此不作解释。参考上面写的代码,我判断了自己机器是大端还是小端,并且实现了int16、int32以及float数据类型的大小端转换:大端转小端之后,再使用相同的代码转换一次,小端又变回了大端。
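PS:float的大小端转化我之前一直以为写的是错的,因为好多数据转化之后输出都是0。后来发现这与float类型在内存中的存放格式(符号位、指数、尾数)有关:字节反转之后,原来尾数的低位字节跑到了指数的位置。例如1.0f的位模式是0x3F800000,反转后变成0x0000803F,这是一个约为4.6e-41的极小数,用%f输出就显示为0.000000。所以我们的程序是对的,再反转一次即可得到原值。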
下面是写的比较详细的二进制读和写的函数。包括数值类型,struct,char*等的读写。
/*
 * Writes rows*cols floats starting at data to fp.
 * Returns true only if every element was written; negative dimensions are
 * rejected.
 */
static bool save_binary_write_floats(FILE *fp, const void *data, int rows, int cols)
{
    if (rows < 0 || cols < 0) {
        return false;
    }
    const size_t count = (size_t)rows * (size_t)cols;
    return fwrite(data, sizeof(float), count, fp) == count;
}

/*
 * Copies every whitespace-separated token of "../out/vocab" to fp as
 * {int length, length bytes} (no terminating NUL is stored).
 * Returns false if the vocab file cannot be opened or a write fails.
 */
static bool save_binary_write_vocab(FILE *fp)
{
    FILE *fp1 = fopen("../out/vocab", "r");
    if (fp1 == NULL) {
        printf("No vocab file!\n");
        return false;
    }

    bool ok = true;
    char str[100];
    int coun = 0;
    /*
     * %99s bounds the read to the buffer size; a longer token is split into
     * several entries instead of overflowing str.
     */
    while (fscanf(fp1, "%99s", str) == 1) {
        const int len = (int)strlen(str);
        coun++;
        printf("%d %d %s\n", coun, len, str);
        if (fwrite(&len, sizeof(int), 1, fp) != 1 ||
            fwrite(str, sizeof(char), (size_t)len, fp) != (size_t)len) {
            ok = false;
            break;
        }
    }
    fclose(fp1);
    return ok;
}

/*
 * Writes the matrix header table followed by the 14 weight matrices.
 * NOTE(review): assumes NMEMB >= 14 and that every final_* buffer holds at
 * least rows*columns floats - confirm against their definitions.
 */
static bool save_binary_write_matrices(FILE *fp)
{
    //binary head, save the row and column of the matrices
    struct matrix
    {
        int row;
        int column;
    }s[NMEMB];

    s[0].row = vocab_total_size; s[0].column = projection_size;
    s[1].row = projection_size;  s[1].column = hidden_size*4;
    s[2].row = 1;                s[2].column = hidden_size;
    s[3].row = 1;                s[3].column = hidden_size;
    s[4].row = 1;                s[4].column = hidden_size;
    s[5].row = hidden_size;      s[5].column = hidden_size*4;
    s[6].row = hidden_size;      s[6].column = punc_total_size;
    /* entries 7..13 (the *_hg matrices) repeat the dimensions of 0..6 */
    for (int i = 0; i < 7; ++i) {
        s[i + 7] = s[i];
    }

    if (fwrite(s, sizeof(struct matrix), NMEMB, fp) != (size_t)NMEMB) {
        return false;
    }

    /* && short-circuits, so writing stops at the first failure */
    return save_binary_write_floats(fp, final_We,     s[0].row,  s[0].column)
        && save_binary_write_floats(fp, final_W,      s[1].row,  s[1].column)
        && save_binary_write_floats(fp, final_Wip,    s[2].row,  s[2].column)
        && save_binary_write_floats(fp, final_Wfp,    s[3].row,  s[3].column)
        && save_binary_write_floats(fp, final_Wop,    s[4].row,  s[4].column)
        && save_binary_write_floats(fp, final_Wr,     s[5].row,  s[5].column)
        && save_binary_write_floats(fp, final_Wy,     s[6].row,  s[6].column)
        && save_binary_write_floats(fp, final_We_hg,  s[7].row,  s[7].column)
        && save_binary_write_floats(fp, final_W_hg,   s[8].row,  s[8].column)
        && save_binary_write_floats(fp, final_Wip_hg, s[9].row,  s[9].column)
        && save_binary_write_floats(fp, final_Wfp_hg, s[10].row, s[10].column)
        && save_binary_write_floats(fp, final_Wop_hg, s[11].row, s[11].column)
        && save_binary_write_floats(fp, final_Wr_hg,  s[12].row, s[12].column)
        && save_binary_write_floats(fp, final_Wy_hg,  s[13].row, s[13].column);
}

/*
 * Saves the vocabulary and all weight matrices to file_path in binary form.
 *
 * File layout, in host byte order:
 *   int vocab_total_size
 *   for each token of "../out/vocab": int length, then that many bytes
 *   NMEMB {int row, int column} headers
 *   the 14 matrices as raw floats, in header order
 *
 * Returns true on success; false if either file cannot be opened or any
 * write (including the final flush in fclose) fails. The output file is
 * closed on every path.
 * NOTE(review): load_binary reads back vocab_total_size tokens, so the vocab
 * file presumably has to contain exactly that many - verify.
 */
bool save_binary(char* file_path)
{
    FILE *fp = fopen(file_path, "wb");
    if (fp == NULL) {
        printf("EROOR\n");
        return false;
    }

    bool ok = fwrite(&vocab_total_size, sizeof(int), 1, fp) == 1
           && save_binary_write_vocab(fp)
           && save_binary_write_matrices(fp);

    /* fclose flushes buffered data; a failure here means data was lost */
    if (fclose(fp) != 0) {
        ok = false;
    }
    return ok;
}
/*
 * Reads rows*cols floats from fp into dst.
 * Returns true only if every element was read; negative dimensions are
 * rejected.
 */
static bool load_binary_read_floats(FILE *fp, void *dst, int rows, int cols)
{
    if (rows < 0 || cols < 0) {
        return false;
    }
    const size_t count = (size_t)rows * (size_t)cols;
    return fread(dst, sizeof(float), count, fp) == count;
}

/*
 * Parses an already opened model file: vocabulary, header table, matrices.
 * Returns false on a short read or an implausible string length.
 * NOTE(review): assumes NMEMB >= 14 and that each destination buffer can
 * hold the rows*columns floats announced by the file - the dimensions come
 * from the file and are not checked against the buffer sizes here.
 */
static bool load_binary_read_contents(FILE *fp)
{
    int vocab_size;
    if (fread(&vocab_size, sizeof(int), 1, fp) != 1) {
        return false;
    }
    printf("%d\n", vocab_size);

    int coun = 0;
    for (int j = 0; j < vocab_size; ++j) {
        int len;
        char str[100];

        coun++;
        if (fread(&len, sizeof(int), 1, fp) != 1) {
            return false;
        }
        /* len comes from the file: it must leave room for the terminator */
        if (len < 0 || (size_t)len >= sizeof str) {
            fprintf(stderr, "load_binary: invalid token length %d\n", len);
            return false;
        }
        if (fread(str, sizeof(char), (size_t)len, fp) != (size_t)len) {
            return false;
        }
        str[len] = '\0'; /* required: the stored bytes carry no terminator */
    }
    printf("%d\n", coun);

    struct matrix
    {
        int row;
        int column;
    }s[NMEMB];

    if (fread(s, sizeof(struct matrix), NMEMB, fp) != (size_t)NMEMB) {
        return false;
    }
    for (int i = 0; i < NMEMB; ++i) {
        printf("row: %d column: %d\n", s[i].row, s[i].column);
    }

    /* && short-circuits, so reading stops at the first failure */
    const bool ok =
           load_binary_read_floats(fp, We,     s[0].row,  s[0].column)
        && load_binary_read_floats(fp, W,      s[1].row,  s[1].column)
        && load_binary_read_floats(fp, Wip,    s[2].row,  s[2].column)
        && load_binary_read_floats(fp, Wfp,    s[3].row,  s[3].column)
        && load_binary_read_floats(fp, Wop,    s[4].row,  s[4].column)
        && load_binary_read_floats(fp, Wr,     s[5].row,  s[5].column)
        && load_binary_read_floats(fp, Wy,     s[6].row,  s[6].column)
        && load_binary_read_floats(fp, We_hg,  s[7].row,  s[7].column)
        && load_binary_read_floats(fp, W_hg,   s[8].row,  s[8].column)
        && load_binary_read_floats(fp, Wip_hg, s[9].row,  s[9].column)
        && load_binary_read_floats(fp, Wfp_hg, s[10].row, s[10].column)
        && load_binary_read_floats(fp, Wop_hg, s[11].row, s[11].column)
        && load_binary_read_floats(fp, Wr_hg,  s[12].row, s[12].column)
        && load_binary_read_floats(fp, Wy_hg,  s[13].row, s[13].column);
    if (!ok) {
        return false;
    }

    /* dump Wfp as a quick visual check of the loaded data */
    for (int j = 0; j < s[3].row; ++j) {
        for (int k = 0; k < s[3].column; ++k) {
            printf("%f ",Wfp[k + j*s[3].column]);
        }
        printf("\n");
    }
    return true;
}

/*
 * Loads the vocabulary and all weight matrices from the binary file at
 * file_path (the layout produced by save_binary, in host byte order).
 * The vocabulary strings are read and discarded; the matrices are stored in
 * We, W, Wip, ... and their *_hg counterparts.
 *
 * Returns true on success; false if the file cannot be opened, is truncated,
 * or contains an invalid string length. The file is closed on every path.
 */
bool load_binary(char *file_path)
{
    FILE *fp = fopen(file_path, "rb");
    if (fp == NULL) {
        printf("EROOR\n");
        return false;
    }

    const bool ok = load_binary_read_contents(fp);
    fclose(fp);
    return ok;
}