相信经常做数据分析的人常常需要对数据进行放大:比如说现在只有100个数据,但需要10000个数据才能更好地分析问题,这时候就需要用到数据放大.
需求:下面是900多个数据,把它放大到10000个.
vim 打开如下图:
一部分数据:
40.095,28,29,2545,10790,1
40.833,29,34,2843,11940,1
40.003,31,31,2948,11651,1
40.173,27,27,2851,11751,1
40.029,27,26,2657,10655,1
39.997,28,28,2849,11004,1
40.324,30,28,2876,11689,1
40.584,29,28,2756,11797,1
40.161,29,27,2924,11045,1
41.554,30,30,2929,11114,1
40.977,31,31,2672,10642,1
40.601,33,28,2829,10840,1
41.398,29,27,2623,10718,1
40.725,30,26,2849,11373,1
41.435,28,30,2933,11890,1
41.034,27,30,2920,11496,1
40.677,29,29,2964,10811,1
40.981,27,29,2833,10657,1
39.994,27,27,2684,10946,1
40.489,29,27,2695,10689,1
39.924,31,30,2992,10861,1
解决思路是:先算出各个成员的平均值,最大值,最小值.
然后用最大值,最小值或者平均值加上一个可调节的随机值,让它与原数据拟合程度更高.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * One record of the CSV file: a fractional duration followed by five
 * integer fields, listed in the order the values appear on each line.
 */
struct dataitem {
    float time_during;
    int   first_int;
    int   second_int;
    int   third_int;
    int   fourth_int;
    int   flag;
};
typedef struct dataitem DATA;

/* Converts one comma-separated text line into a DATA record. */
DATA function(char *string);
int main()
{
FILE* fp = NULL;
char* name = "bot200.csv";
fp = fopen(name,"rb+");
if(!fp)
{
printf("error!");
exit(-1);
}
DATA item;
char string[32];
double time_during = 0.0;
float first_int = 0;
float second_int = 0;
float third_int = 0;
float fourth_int = 0;
float flag = 0;
int i = 0;
for(i = 0;i < 901;i++)
{
fgets(string,sizeof string,fp);
item = function(string);
time_during += item.time_during;
first_int += item.first_int;
second_int += item.second_int;
flag += item.flag;
}
printf("time = %lf,first_int = %f,second_int = %f,third_int = %f,fourth_int = %f,flag = %f\n",time_during/i,first_int/i,second_int/i,third_int/i,fourth_int/i,flag/i);
fclose(fp);
return 0;
}
/* Returns a pointer just past the next ',' in cursor, or to the terminating NUL. */
static const char *next_field(const char *cursor)
{
    while (*cursor != '\0' && *cursor != ',')
    {
        cursor++;
    }
    return (*cursor == ',') ? cursor + 1 : cursor;
}

/* Reads a base-10 integer at *cursor and advances *cursor to the following field. */
static int read_int_field(const char **cursor)
{
    char *end = NULL;
    long value = strtol(*cursor, &end, 10);

    *cursor = next_field(end);
    return (int)value;
}

/*
 * Parses one CSV line of the form
 *     time,first,second,third,fourth,flag
 * into a DATA record.
 *
 * string: NUL-terminated line; a trailing newline is tolerated. The buffer
 *         is not modified.
 *
 * Returns the parsed record. A field that is missing or not numeric is
 * stored as 0; a NULL string yields an all-zero record.
 */
DATA function(char* string)
{
    DATA Temp = {0};

    if (string == NULL)
    {
        return Temp;
    }

    const char *cursor = string;
    char *end = NULL;

    /* The first column is a decimal number such as "40.095". */
    Temp.time_during = (float)strtod(cursor, &end);
    cursor = next_field(end);

    /* The remaining five columns are integers, read in file order. */
    Temp.first_int = read_int_field(&cursor);
    Temp.second_int = read_int_field(&cursor);
    Temp.third_int = read_int_field(&cursor);
    Temp.fourth_int = read_int_field(&cursor);
    Temp.flag = read_int_field(&cursor);

    return Temp;
}
算出几个关键值,然后根据关键值生成需要的数据
代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/*
 * Appends 10000 randomly generated records to "bot200-1.csv".
 *
 * Each record is one text line "time,first,second,third,fourth,flag" whose
 * values are drawn from these ranges:
 *     time   34.678 .. 41.677 (three decimals)
 *     first  26 .. 38
 *     second 26 .. 39
 *     third  1995 .. 3768
 *     fourth 11177 .. 13156
 *     flag   1 .. 2
 *
 * Records are written with fprintf so that exactly the formatted characters
 * reach the file; no fixed-size intermediate buffer is involved.
 *
 * Returns 0 on success. Terminates with a non-zero status when the file
 * cannot be opened, written or closed.
 */
int main()
{
    enum { RECORD_COUNT = 10000 };

    const char *name = "bot200-1.csv";
    FILE *fp = fopen(name, "a+");
    if (!fp)
    {
        printf("error!");
        exit(-1);
    }

    /* Seed from the clock so that every run produces different data. */
    srand((unsigned)time(NULL));

    for (int i = 0; i < RECORD_COUNT; i++)
    {
        /* One rand() call per column, in column order. */
        float time_during = (float)((rand() % 7000) / 1000.000 + 34.678);
        int first_int = rand() % 13 + 26;
        int second_int = rand() % 14 + 26;
        int third_int = rand() % 1774 + 1995;
        int fourth_int = rand() % 1980 + 11177;
        int flag = rand() % 2 + 1;

        if (fprintf(fp, "%2.3f,%2d,%2d,%4d,%5d,%1d\n",
                    time_during, first_int, second_int,
                    third_int, fourth_int, flag) < 0)
        {
            fclose(fp);
            printf("error!");
            exit(-1);
        }
    }

    /* fclose flushes buffered output, so a failure here means lost data. */
    if (fclose(fp) != 0)
    {
        printf("error!");
        exit(-1);
    }
    return 0;
}
当然你也可以把这两个文件合成一个文件.
其他的数据放大与这思路类似.