#include <ctype.h>
#include <float.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Print the command-line usage summary to stdout and terminate the
 * process with exit status 1. This function does not return.
 */
void exit_with_help(void)
{
	/* Each option is printed on its own line, hence the trailing "\n". */
	printf(
	"Usage: svm-scale [options] data_filename\n"
	"options:\n"
	"-l lower : x scaling lower limit (default -1)\n"
	"-u upper : x scaling upper limit (default +1)\n"
	"-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
	"-s save_filename : save scaling parameters to save_filename\n"
	"-r restore_filename : restore scaling parameters from restore_filename\n"
	);
	exit(1);
}
/* Buffer holding the sample (input line) most recently read by readline(). */
char *line = NULL;

/* Current capacity of `line` in bytes; the initial size is 1024. */
int max_line_len = 1024;

/* Target range that feature values are scaled into. */
double lower = -1.0;
double upper = 1.0;

/* Target range for the label (y) value; only meaningful when y_scaling is set. */
double y_lower;
double y_upper;

/* Non-zero when the label (y) value is to be scaled as well. */
int y_scaling = 0;

/* Per-feature maximum and minimum, indexed by feature index. */
double *feature_max;
double *feature_min;

/* Running maximum / minimum of the label value (used for y scaling). */
double y_max = -DBL_MAX;
double y_min = DBL_MAX;

/* Largest feature index seen across all samples. */
int max_index;

/* Number of non-zero entries before and after scaling. */
long int num_nonzeros = 0;
long int new_num_nonzeros = 0;

/* Note: both macros evaluate their arguments more than once. */
#define max(x,y) ((x) > (y) ? (x) : (y))
#define min(x,y) ((x) < (y) ? (x) : (y))

/* Print the (optionally scaled) label of a sample. */
void output_target(double value);

/* Scale one index:value entry of a sample and print it if the result is non-zero. */
void output(int index, double value);

/* Read one complete sample into `line`; returns NULL at end of input. */
char *readline(FILE *input);
int main(int argc,char **argv)
{
int i,index;
FILE *fp, *fp_restore = NULL;
char *save_filename = NULL;
char *restore_filename = NULL;
for(i=1;i { if(argv[i][0] != '-') break;//用户没有指定-l, -u, -y, -s, -r的值 ++i; switch(argv[i-1][1]) { case 'l': lower = atof(argv[i]); break; case 'u': upper = atof(argv[i]); break; case 'y': y_lower = atof(argv[i]); ++i; y_upper = atof(argv[i]); y_scaling = 1; break; case 's': save_filename = argv[i]; break; case 'r': restore_filename = argv[i]; break; default: fprintf(stderr,"unknown option/n"); exit_with_help(); } } if(!(upper > lower) || (y_scaling && !(y_upper > y_lower))) { fprintf(stderr,"inconsistent lower/upper specification/n"); exit(1); } if(restore_filename && save_filename) { fprintf(stderr,"cannot use -r and -s simultaneously/n"); exit(1); } if(argc != i+1) exit_with_help(); fp=fopen(argv[i],"r");//读入待缩放文件 if(fp==NULL) { fprintf(stderr,"can't open file %s/n", argv[i]); exit(1); } line = (char *) malloc(max_line_len*sizeof(char));//开辟1024的空间 #define SKIP_TARGET/ while(isspace(*p)) ++p;/ while(!isspace(*p)) ++p; #define SKIP_ELEMENT/ while(*p!=':') ++p;/ ++p;/ while(isspace(*p)) ++p;/ while(*p && !isspace(*p)) ++p; /* assumption: min index of attributes is 1 */ //注意:这里默认最小index为1,不是0!!! 
/* pass 1: find out max index of attributes */ max_index = 0; if(restore_filename) //如果指定-r参数,则从其后面指定的文件中找到最大的属性个数 { int idx, c; fp_restore = fopen(restore_filename,"r"); if(fp_restore==NULL) { fprintf(stderr,"can't open file %s/n", restore_filename); exit(1); } c = fgetc(fp_restore); if(c == 'y') { readline(fp_restore); readline(fp_restore); readline(fp_restore); } readline(fp_restore); readline(fp_restore); while(fscanf(fp_restore,"%d %*f %*f/n",&idx) == 1) max_index = max(idx,max_index); //和现有值比较 rewind(fp_restore); } while(readline(fp)!=NULL) //待缩放文件中读入一个样本数据 { char *p=line; SKIP_TARGET while(sscanf(p,"%d:%*f",&index)==1) //此行数据中不断读入index值(注意有可能出现跳跃现象,如: 1:value1 3:value3 ) { max_index = max(max_index, index); //求出最大的index值,存于max_index,主要为了feature_max,feature_min开辟空间 SKIP_ELEMENT num_nonzeros++; } } rewind(fp);//重新指向文件流的开头 feature_max = (double *)malloc((max_index+1)* sizeof(double)); feature_min = (double *)malloc((max_index+1)* sizeof(double)); if(feature_max == NULL || feature_min == NULL) { fprintf(stderr,"can't allocate enough memory/n"); exit(1); } for(i=0;i<=max_index;i++) { feature_max[i]=-DBL_MAX; feature_min[i]=DBL_MAX; } /* pass 2: find out min/max value */ while(readline(fp)!=NULL)//读入一个样本数据 { char *p=line; int next_index=1; double target; double value; sscanf(p,"%lf",&target);//y_scaling 有关?????? 
y_max = max(y_max,target); y_min = min(y_min,target); SKIP_TARGET while(sscanf(p,"%d:%lf",&index,&value)==2) //从一个样本中读入每一个index和对应的value { for(i=next_index;i { feature_max[i]=max(feature_max[i],0); //这两句是为了出现index0:value0 index2:value2的时候,将中间省略的0项 feature_min[i]=min(feature_min[i],0);//index1:value1也考虑进内 } feature_max[index]=max(feature_max[index],value);//取最大值 feature_min[index]=min(feature_min[index],value);//取最小值 SKIP_ELEMENT next_index=index+1; } for(i=next_index;i<=max_index;i++) //对于读入此样本数据,可能总属性个数不能达到最大个数max_index,此处需要考虑此样本 //后面省略的那些0值 { feature_max[i]=max(feature_max[i],0); feature_min[i]=min(feature_min[i],0); } } rewind(fp); /* pass 2.5: save/restore feature_min/feature_max */ if(restore_filename) { /* fp_restore rewinded in finding max_index */ int idx, c; double fmin, fmax; if((c = fgetc(fp_restore)) == 'y') { fscanf(fp_restore, "%lf %lf/n", &y_lower, &y_upper); fscanf(fp_restore, "%lf %lf/n", &y_min, &y_max); y_scaling = 1; } else ungetc(c, fp_restore); if (fgetc(fp_restore) == 'x') { fscanf(fp_restore, "%lf %lf/n", &lower, &upper); while(fscanf(fp_restore,"%d %lf %lf/n",&idx,&fmin,&fmax)==3)//如果指定-r,则feature_max,feature_min的值以 //fp_restore中存的每一个列的最大最小值为准,前面从待缩放文件中求出的每列最大最小值就没用了。此时缩放之后的值有可能不在 //[lower,upper]之内。很明显这是合理的,因为待测试的样本(即需要-r来缩放的,都是不可靠的数据,所以不能将他们的最值作为缩 //放的标准,而用-s参数时,因为样本数据都是训练样本都是经过标记的,所以可以作为缩放的标准 { if(idx<=max_index) { feature_min[idx] = fmin; feature_max[idx] = fmax; } } } fclose(fp_restore); } if(save_filename)//将feature_max,feature_min存成文件 { FILE *fp_save = fopen(save_filename,"w"); if(fp_save==NULL) { fprintf(stderr,"can't open file %s/n", save_filename); exit(1); } if(y_scaling) { fprintf(fp_save, "y/n"); fprintf(fp_save, "%.16g %.16g/n", y_lower, y_upper); fprintf(fp_save, "%.16g %.16g/n", y_min, y_max); } fprintf(fp_save, "x/n"); fprintf(fp_save, "%.16g %.16g/n", lower, upper); for(i=1;i<=max_index;i++) { if(feature_min[i]!=feature_max[i]) fprintf(fp_save,"%d %.16g %.16g/n",i,feature_min[i],feature_max[i]); } fclose(fp_save); } /* pass 
3: scale */ while(readline(fp)!=NULL)//对样本数据进行缩放 { char *p=line; int next_index=1; double target; double value; sscanf(p,"%lf",&target); output_target(target); SKIP_TARGET while(sscanf(p,"%d:%lf",&index,&value)==2) { for(i=next_index;i output(i,0); output(index,value);//对非0值进行缩放 SKIP_ELEMENT next_index=index+1; } for(i=next_index;i<=max_index;i++)//对剩下的空值也以0值进行缩放 output(i,0); printf("/n"); } if (new_num_nonzeros > num_nonzeros) fprintf(stderr, "Warning: original #nonzeros %ld/n" " new #nonzeros %ld/n" "Use -l 0 if many original feature values are zeros/n", num_nonzeros, new_num_nonzeros); free(line); free(feature_max); free(feature_min); fclose(fp); return 0; } char* readline(FILE *input)//读入一个样本,注意是一个样本,不是一个行数据 { int len; if(fgets(line,max_line_len,input) == NULL)//读入一行 return NULL; while(strrchr(line,'/n') == NULL)//判断是不是到了一个样本的末尾‘/n' { max_line_len *= 2; line = (char *) realloc(line, max_line_len);//将空间加2倍 len = (int) strlen(line); if(fgets(line+len,max_line_len-len,input) == NULL)//读入一行的新的数据,并加到line的后面 break; } return line; } void output_target(double value) { if(y_scaling) { if(value == y_min) value = y_lower; else if(value == y_max) value = y_upper; else value = y_lower + (y_upper-y_lower) * (value - y_min)/(y_max-y_min); } printf("%g ",value); } void output(int index, double value) { /* skip single-valued attribute */ if(feature_max[index] == feature_min[index])//一列的最大值和最小值相等,不用处理 return; if(value == feature_min[index]) value = lower; else if(value == feature_max[index]) value = upper; else value = lower + (upper-lower) * (value-feature_min[index])/ (feature_max[index]-feature_min[index]); if(value != 0) { printf("%d:%g ",index, value); new_num_nonzeros++; } }