现给定一个包含一千条原始数据的txt文件,要求清洗掉多余字符,并按照字段以空格分隔、记录以换行分隔的规则导入hive中。
1、导入txt文件
使用BufferedReader类逐行读取txt文件,准备进行处理。
2、清洗数据
使用字符串分割函数split()将数据按照空格、/、+、,等字符进行分割。
3、导出txt文件
使用FileWriter类导出txt文件,准备进行上传。
4、上传文件
将清洗完毕后导出的数据文件上传至hdfs中。
5、导入hive
从hdfs中将文件导入hive。
程序如下
import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.sql.Connection; import java.sql.PreparedStatement; import java.util.ArrayList; public class CleanData { public static ArrayListip = new ArrayList (); public static ArrayList date = new ArrayList (); public static ArrayList day = new ArrayList (); public static ArrayList traffic = new ArrayList (); public static ArrayList type = new ArrayList (); public static ArrayList id = new ArrayList (); public static void cleanData() throws IOException { String str; File f = new File("/home/ryq1998/Documents/Tencent Files/316703799/FileRecv/result.txt"); BufferedReader bf = new BufferedReader(new FileReader(f)); try { while ((str = bf.readLine()) != null) { String[] s = str.split(","); ip.add(s[0]); String[] newdate = s[1].split("\\\\|\\:|\\b|\\+"); date.add(newdate[4] + "-" + "11" + "-" + newdate[0] + " " + newdate[6] + ":" + newdate[8] + ":" + newdate[10]); day.add(s[2]); String[] newtriffic = s[3].split(" "); traffic.add(Long.parseLong(newtriffic[0])); type.add(s[4]); id.add(s[5]); } } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { bf.close(); returnText(ip, date, day, traffic, type, id); /* * 将数据插入mysql数据库 */ /*addSql(ip, date, day, traffic, type, id);*/ } } /* * 存储mysql数据库 */ public static void addSql(ArrayList ip, ArrayList date, ArrayList day, ArrayList traffic, ArrayList type, ArrayList id) { Connection con = null; try { con = JdbcUtils.getConnection(); PreparedStatement psql; for (int i = 0; i < ip.size(); i++) { psql = con.prepareStatement( "insert into CleanData(ip,date,day,traffic,type,id) " + "values(?,?,?,?,?,?)"); psql.setString(1, ip.get(i)); psql.setString(2, date.get(i)); psql.setString(3, day.get(i)); psql.setLong(4, traffic.get(i)); psql.setString(5, type.get(i)); psql.setString(6, id.get(i)); psql.executeUpdate(); 
psql.close(); } con.close(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static void returnText(ArrayList ip, ArrayList date, ArrayList day, ArrayList traffic, ArrayList type, ArrayList id) { FileWriter fileWriter = null; try { fileWriter = new FileWriter("/home/ryq1998/result.txt");//创建文本文件 int i=0; for(;i ) { if(i==ip.size()-1) { fileWriter.write(ip.get(i)+" "+date.get(i)+" "+day.get(i)+" "+traffic.get(i)+" "+type.get(i)+" "+id.get(i)); break; } fileWriter.write(ip.get(i)+" "+date.get(i)+" "+day.get(i)+" "+traffic.get(i)+" "+type.get(i)+" "+id.get(i)+"\r");//写入 \r\n换行 } fileWriter.flush(); fileWriter.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } public static void main(String[] args) throws IOException { cleanData(); } }
截图如下