weka数据集预处理

1.     利用有监督的离散化算法对数据集的属性进行离散化,并保存离散化后的数据集。


import java.io.File;

import weka.filters.SupervisedFilter;
import java.io.IOException;


import weka.core.Instances;
import weka.core.converters.CSVLoader;
import weka.core.converters.ConverterUtils.DataSink;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.*;
import weka.filters.supervised.attribute.Discretize;
//unsupervised
import weka.filters.unsupervised.attribute.AddID;


public class preprocess {


/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
// TODO Auto-generated method stub
Instances instances = DataSource.read("C:/Users/PC/Desktop/segment-challenge.arff");
instances.setClassIndex(instances.numAttributes() - 1); 
Discretize discretize = new Discretize();

System.err.println(instances.toSummaryString()); 

AddID filter = new AddID(); 

String[] options = new String[6]; 


options[0] = "-B"; 


options[1] = "8"; 


options[2] = "-M"; 


options[3] = "-1.0"; 


options[4] = "-R"; 


options[5] = "2-last"; 


discretize.setOptions(options); 


discretize.setInputFormat(instances); 


Instances newInstances2 = Filter.useFilter(instances, discretize); 


System.err.println(newInstances2.toSummaryString()); 


DataSink.write("data/1.arff", newInstances2); 



}

}



2.   利用weka中的Normalize过滤器对segment-challenge.arff数据集进行标准化(归一化,默认将数值属性缩放到[0,1]区间)处理,并保存处理后的数据集。


import java.io.File;
import weka.filters.SupervisedFilter;
import java.io.IOException;


import weka.core.Instances;
import weka.core.converters.CSVLoader;
import weka.core.converters.ConverterUtils.DataSink;
import weka.core.converters.ConverterUtils.DataSource;
import weka.filters.*;
import weka.filters.unsupervised.attribute.*;
//unsupervised
import weka.filters.unsupervised.attribute.AddID;
public class preprocess {


/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
// TODO Auto-generated method stub
Instances instances = DataSource.read("C:/Users/PC/Desktop/segment-challenge.arff");
instances.setClassIndex(instances.numAttributes() - 1); 
Normalize normalize  = new Normalize ();

System.err.println(instances.toSummaryString()); 


String[] options = new String[6]; 


options[0] = "-B"; 


options[1] = "8"; 


options[2] = "-M"; 


options[3] = "-1.0"; 


options[4] = "-R"; 


options[5] = "2-last"; 


normalize.setOptions(options); 


normalize.setInputFormat(instances); 


Instances newInstances2 = Filter.useFilter(instances, normalize); 


System.err.println(newInstances2.toSummaryString()); 


DataSink.write("C:/Users/PC/Desktop/2.arff", newInstances2); 



}
}







http://blog.sina.com.cn/s/blog_6f611c30010185kz.html

http://blog.163.com/shen_960124/blog/static/60730984201502884651349/







你可能感兴趣的:(数据挖掘)