今天帮同学处理数据, 主要是从1w多条记录中随机抽取8k条, 然后再从这8k条记录中随机抽取2k条. 最后将这2k条记录随机分成10组, 使得各组之间的记录互不重复.
下面将我的代码都贴上来, 方便以后处理csv文件时参考.
package spt.csv; import java.io.Serializable; import java.nio.charset.Charset; import spt.util.PropertyConfig; /** * CSV文件操作基础类. */ abstract public class CSVBasic implements Serializable { private Charset charset; //编码. private char delimiter; //分隔符. private String fileName; /** * 默认编码. * * @return */ public static Charset getDefaultCharset() { return Charset.forName(PropertyConfig.getProperty("charset")); } /** * 默认分割符. * * @return */ public static char getDefaultDelimiter() { return PropertyConfig.getProperty("delimiter").charAt(0); } public String getFileName() { return fileName; } public void setFileName(String fileName) { this.fileName = fileName; } public Charset getCharset() { return charset; } public void setCharset(Charset charset) { this.charset = charset; } public void setDelimiter(char delimiter) { this.delimiter = delimiter; } public char getDelimiter() { return delimiter; } public CSVBasic() {} /**使用默认的分隔符和编码. * @param fileName */ public CSVBasic(String fileName) { this(fileName, getDefaultDelimiter(), getDefaultCharset()); } public CSVBasic(String fileName, char delimiter, Charset charset) { setFileName(fileName); setDelimiter(delimiter); setCharset(charset); } /** * */ private static final long serialVersionUID = 7916808982930771124L; }
3.读取csv文件,并映射记录为List<Map<String, String>> 对象:
package spt.csv; import java.io.FileNotFoundException; import java.io.IOException; import java.nio.charset.Charset; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import spt.util.PropertyConfig; import com.csvreader.CsvReader; /** * 读取csv文件的类. */ public class Reader extends CSVBasic { private CsvReader reader; public CsvReader getReader() { return reader; } public void setReader(CsvReader reader) { this.reader = reader; } public Reader(String fileName) throws FileNotFoundException { this(fileName, getDefaultDelimiter(), getDefaultCharset()); } public Reader(String fileName, char delimiter, Charset charset) throws FileNotFoundException { // set before getting. super(fileName, delimiter, charset); setReader(new CsvReader(fileName, delimiter, charset)); } /**根据字段列表,见每条记录映射为一个Map对象的列表. * @param fieldNames * 指定配置文件中字段名的'键'的列表. * @return */ public List<Map<String, String>> getResult(List<String> fieldNames) { // 每行中的每一个项是一个Map<String, String>的键值对. List<Map<String, String>> lines = new ArrayList<Map<String, String>>(); CsvReader r = null; try { r = getReader(); r.readHeaders(); // 读取表头. Map<String, String> itemMap = null; // 每一条记录是一个Map<String, String>. while (r.readRecord()) { itemMap = new HashMap<String, String>(); String k = null; // 每一条记录添加键值对. for (String fieldName : fieldNames) { // 字段名. k = PropertyConfig.getProperty(fieldName); itemMap.put(k, r.get(k)); } lines.add(itemMap); } return lines; } catch (IOException e) { e.printStackTrace(); return null; } finally { if(r != null) r.close(); } } @Override public String toString() { return getFileName(); } /** * */ private static final long serialVersionUID = -1712774594374451546L; }
4.将List<Map<String, String>>输出为csv文件的类:
package spt.csv; import java.io.IOException; import java.nio.charset.Charset; import java.util.List; import java.util.Map; import com.csvreader.CsvWriter; /** * csv文件写入类. */ public class Writer extends CSVBasic { private CsvWriter writer = null; public boolean write(List<String> fieldNames, List<Map<String, String>> mapList) { CsvWriter writer = null; try { writer = getWriter(); // 写入表头. writer.writeRecord((String[]) fieldNames .toArray(new String[fieldNames.size()])); for (Map<String, String> map : mapList) { // 存储每行记录. String[] records = new String[fieldNames.size()]; for (int i = 0; i < fieldNames.size(); i++) records[i] = map.get(fieldNames.get(i)); // 写入每行记录. writer.writeRecord(records); } return true; } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); return false; } finally { if (writer != null) writer.close(); } } public Writer() { this(null, getDefaultDelimiter(), getDefaultCharset()); } public Writer(String fileName) { this(fileName, getDefaultDelimiter(), getDefaultCharset()); } public Writer(String fileName, char delimiter, Charset charset) { super(fileName, delimiter, charset); writer = new CsvWriter(fileName, delimiter, charset); } public CsvWriter getWriter() { return writer; } public void setWriter(CsvWriter writer) { this.writer = writer; } /** * */ private static final long serialVersionUID = -9141083858975437622L; }
5.表中有一个字段NYR, 表示时间, 由于需要将结果按照时间的先后顺序排序, 所以定义一个比较器:
package spt.csv; import java.text.ParseException; import java.util.Comparator; import java.util.Map; import spt.util.DateService; import spt.util.PropertyConfig; /** *每条记录是一个Map对象,按照每条记录中的'时间'的列进行排序. */ public class RecordDateComparator implements Comparator<Map<String, String>> { @Override public int compare(Map<String, String> m1, Map<String, String> m2) { try { long l01 = DateService.getDate(m1.get(PropertyConfig.getProperty("NYR"))).getTime(); long l02 = DateService.getDate(m2.get(PropertyConfig.getProperty("NYR"))).getTime(); //long的范围和int的范围不同. long diff = l01 - l02; if(diff < 0) return -1; else if(diff > 0) return 1; return 0; } catch (ParseException e) { e.printStackTrace(); return 0; } } }
6,在main类中:
package spt.csv; import java.io.File; import java.io.FileNotFoundException; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Random; import spt.util.PropertyConfig; /** * 从1w多条记录中先选出8k条,然后在8k条记录中选出2k条,最后将2k条记录分成10组. */ public class ReadWriteDemo { /** * @param args */ public static void main(String[] args) { // if (args.length < 1) // throw new NullPointerException("请指定文件路径"); System.out.println("执行中...执行过程请不要关闭此窗口!"); final int first_size = Integer.parseInt(PropertyConfig .getProperty("first_size")); // 初次提取长度(8k). final int second_size = Integer.parseInt(PropertyConfig .getProperty("second_size")); // 初次提取(2k). final int groupCount = Integer.parseInt(PropertyConfig .getProperty("groupCount")); // 分组个数(10). String file = PropertyConfig.getProperty("input_file"); // 源文件路径. List<String> fieldNames = null; try { fieldNames = initFields(); Reader csv = new Reader(file); // 总记录. List<Map<String, String>> totalList = csv.getResult(fieldNames); // 初次提取的值(8k). List<Map<String, String>> firstTaken = random(totalList, first_size); // 再次提取的值(2k). List<Map<String, String>> secondTaken = random(firstTaken, second_size); // 每组记录数(2百). List<Map<String, String>> tmpTaken = secondTaken; for (int i = 0; i < groupCount; i++) { List<Map<String, String>> AGroupTaken = random(tmpTaken, second_size / groupCount); // 除去上次已经使用的元素. tmpTaken.removeAll(AGroupTaken); // 在当前目录上输出(并验证是否存在). String outputFile = null; // 如果文件已存在,则自动命名. int fileCount = 0; do { outputFile = "result" + fileCount++ + ".csv"; } while (new File(outputFile).exists()); Writer writer = new Writer(outputFile); // (集合)排序. Collections.sort(AGroupTaken, new RecordDateComparator()); writer.write(fieldNames, AGroupTaken); } System.out.println("done!"); } catch (FileNotFoundException e) { System.out.println("请指定正确的文件路径!"); // TODO Auto-generated catch block e.printStackTrace(); } } /** * 随机产生新的列表(长度比原来小). * * @param originalList * 输入列表. 
* @param new_size * 新列表的长度. */ public static List<Map<String, String>> random( List<Map<String, String>> originalList, int new_size) { if (new_size <= 0 || new_size > originalList.size()) throw new IndexOutOfBoundsException("新列表的长度错误!"); List<Map<String, String>> newList = new ArrayList<Map<String, String>>( new_size); // 标识是否已被提取. boolean[] taken = new boolean[originalList.size()]; Random r = new Random(); Map<String, String> map = null; // 即将获取的元素. int rIdx = 0; for (int i = 0; i < new_size; i++) { do { rIdx = r.nextInt(new_size); map = originalList.get(rIdx); } while (taken[rIdx]); // 如果发现已经提取,则重复操作. taken[rIdx] = true; // 标识已被提取. newList.add(map); } return newList; } private static List<String> initFields() { // 所有字段. List<String> fieldNames = new ArrayList<String>(14); fieldNames.add("id"); fieldNames.add("AJMC"); fieldNames.add("JYAQ"); fieldNames.add("AJLB"); fieldNames.add("AJFAB"); fieldNames.add("AJZT"); fieldNames.add("BASJ"); fieldNames.add("FXSJ"); fieldNames.add("FASJSX"); fieldNames.add("FASJXX"); fieldNames.add("AJBH"); fieldNames.add("ZBX"); fieldNames.add("ZBY"); fieldNames.add("NYR"); return fieldNames; } }
7,用到的自定义工具类为:
package spt.util; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.Date; /** * 2015-2-27 提供日期转换的工具类. */ public class DateService { // 定义称线程共享,而不是没调用一次就创建一个对象. private static DateFormat formater = new SimpleDateFormat(PropertyConfig.getProperty("date_format")); /** * 将字符串类型的日期转换为Date. * * @param strDate * @return * @throws ParseException */ public static Date getDate(String strDate) throws ParseException { // 如果输入为空,则返回null. if (Str.isEmpty(strDate)) return null; return formater.parse(strDate); } /** * 将java.util.Date转换为java.sql.Date;用于诸如'PreparedStatement.setDate'方法. * * @param utilDate * @return */ public static java.sql.Date getSQLDate(java.util.Date utilDate) { if (utilDate == null) return null; return new java.sql.Date(utilDate.getTime()); } /** * 将指定的日期转换为 * * @param date * @return */ public static String getDateStr(java.util.Date date) { if (date == null) return null; return formater.format(date); } /** * 计算指定日期与今天的间隔,判断是否是需要日期. disDay表示与今天相隔天数,0:等于今天;1:明天;-1:昨天. * * @param anotherDate * @param disDay * @return */ public static boolean isSpecifiedDay(Date anotherDate, int disDay) { if (anotherDate == null) return false; Calendar cNow = Calendar.getInstance(); cNow.setTime(new Date()); // 每调用一次,都是与当前时间做比较. cNow.add(Calendar.DAY_OF_MONTH, disDay); Calendar cAnotherDate = Calendar.getInstance(); cAnotherDate.setTime(anotherDate); return cNow.get(Calendar.YEAR) == cAnotherDate.get(Calendar.YEAR) && cNow.get(Calendar.MONTH) == cAnotherDate.get(Calendar.MONTH) && cNow.get(Calendar.DAY_OF_MONTH) == cAnotherDate.get(Calendar.DAY_OF_MONTH); } }
package spt.util; import java.io.IOException; import java.net.URL; import java.util.Properties; /** * 2015-2-27 */ public class PropertyConfig { /** * @param key * @return */ public static String getProperty(String key) { Properties properties = getProperties(); return properties.getProperty(key); } /** * @param resources * @return */ public static Properties getProperties() { final String configFilePath = "raw/properties.properties"; URL url = PropertyConfig.class.getClassLoader().getResource(configFilePath); Properties props = new Properties(); try { props.load(url.openStream()); } catch (IOException e) { e.printStackTrace(); return null; } return props; } }
package spt.util; /** *字符串工具类. */ public class Str { /** * 判断一个字符串是否有内容. * * @param str * @return 如果不不为空,则返回true,否则返回false. */ public static boolean hasLength(String str) { return !isEmpty(str); } /**判断字符串是否为空. * @param str * @return */ public static boolean isEmpty(String str) { return str == null || str.isEmpty(); } }
其中,配置文件"raw/properties.properties"是放置在src目录下.