Generating MLlib training and test sets from multiple text files on HDFS

Each text file contains a single column of data; the selected files are merged, column by column, into a training set for use by a machine learning algorithm.

The HDFS path of each text file is passed in as a command-line argument, which makes the program more general. All files are appended into a single array, which is then sliced and written out in the required layout. Performance is reasonable, and the test results were acceptable.
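For example, if three files are merged and their values on some line are 1.0, 0.5 and 0.7 (hypothetical numbers), the corresponding output row is "1.0  1::0.5  2::0.7": the first column is written as-is, and every later column is prefixed with its index and "::".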


package pack
import java.io.{File, PrintWriter}
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.ArrayBuffer
/**
  * Created by hemin on 2016/9/20.
  */
object ReadFile {
  def main(args: Array[String]): Unit = {
    // required setup for any Spark program; "local" mode is used here for testing
    val conf = new SparkConf().setAppName("mktxt").setMaster("local")
    val sc = new SparkContext(conf)
    // each command-line argument is the HDFS path of one input text file
    val paths = args
    getData(paths, sc)
  }
  def getData(paths: Array[String], sc: SparkContext) = {
    val index = paths.length - 1   // the last argument is skipped on purpose
    val temp = ArrayBuffer[String]()
    val writer = new PrintWriter(new File("/home/iespark/zzh/test.txt"))
    // row count, taken from the first file; every file is assumed to have
    // the same number of non-empty lines
    val len = sc.textFile(paths(0)).filter(x => x != "").collect().length
    // collect the lines of every file into one flat array, file after file
    for (i <- 0 until index) {
      temp ++= sc.textFile(paths(i)).filter(x => x != "").collect()
    }
    // write one merged row per line: column 0 as-is, every later column j
    // prefixed with "j::"; note that the last row (i = len - 1) is dropped
    for (i <- 0 to len - 2) {
      for (j <- 0 until index) {
        if (j == 0) {
          writer.write(temp(i + len * j) + "  ")
        } else {
          writer.write(j + "::" + temp(i + len * j) + "  ")
        }
      }
      writer.write("\n")
    }
    writer.close()

    // re-read the merged local file and store it on HDFS as a single part file
    val txt = sc.textFile("/home/iespark/zzh/test.txt")
    txt.repartition(1).saveAsTextFile("hdfs://hadoopadmin:9000/user/iespark/zzh/test")
  }
}
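
The merged file contains rows of the form "label  1::v1  2::v2". Below is a minimal sketch (not part of the original program) of how such rows could be parsed into MLlib LabeledPoints and then split into the training and test sets mentioned in the title; the numeric first column, the 0.8/0.2 split ratio, and the variable names are assumptions.

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// assuming an existing SparkContext named sc, as in the program above
val rows = sc.textFile("hdfs://hadoopadmin:9000/user/iespark/zzh/test")
val points = rows.map { line =>
  val tokens = line.trim.split("\\s+")
  val label = tokens.head.toDouble          // first column assumed numeric
  // remaining tokens have the form "j::value"; shift j to a 0-based index
  val pairs = tokens.tail.map { t =>
    val Array(j, v) = t.split("::")
    (j.toInt - 1, v.toDouble)
  }
  LabeledPoint(label, Vectors.sparse(tokens.length - 1, pairs))
}
// hypothetical 80/20 train/test split
val Array(training, test) = points.randomSplit(Array(0.8, 0.2), seed = 42L)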
