大数据公益大学提供的一份数据,义务处理一下,原始数据是Excel,含有html标签,如下:
要求清洗掉html标签,和微博内容中的url地址。
主要分为两部分:
1.处理文本,清洗数据。
2.处理excel读写操作。
上代码:
ExcelUtil类,包含Excel2003-2007的读写操作,Excel使用Apache POI进行操作,需要jar包如下:
package dat.datadeal; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Locale; import java.util.logging.Level; import java.util.logging.Logger; import org.apache.poi.hssf.usermodel.HSSFCell; import org.apache.poi.hssf.usermodel.HSSFCellStyle; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.DateUtil; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.xssf.usermodel.XSSFWorkbook; /** * * @author daT [email protected] *2003,2007版excel读写工具 */ public class ExcelUtil{ /** * Excel文件读取 * @param filePath * @return String[]存的是行,List存的是列。 * 一个excel一次全部读入内存(Excel超大需要另行处理) */ public List<String[]> readExcel(String filePath) { List<String[]> dataList = new ArrayList<String[]>(); boolean isExcel2003 = true; if (isExcel2007(filePath)) { isExcel2003 = false; } File file = new File(filePath); InputStream is = null; try { is = new FileInputStream(file); } catch (FileNotFoundException ex) { Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex); } Workbook wb = null; try { wb = isExcel2003 ? new HSSFWorkbook(is) : new XSSFWorkbook(is); } catch (IOException ex) { Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex); } Sheet sheet = wb.getSheetAt(0); int totalRows = sheet.getPhysicalNumberOfRows(); int totalCells = 0; if (totalRows >= 1 && sheet.getRow(0) != null) { totalCells = sheet.getRow(0).getPhysicalNumberOfCells(); } for (int r = 0; r < totalRows; r++) { Row row = sheet.getRow(r); if (row == null) { continue; } String[] rowList = new String[totalCells]; for (int c = 0; c < totalCells; c++) { Cell cell = row.getCell(c); String cellValue = ""; if (cell == null) { rowList[c] = (cellValue); continue; } cellValue = ConvertCellStr(cell, cellValue); rowList[c] = (cellValue); } dataList.add(rowList); } return dataList; } private String ConvertCellStr(Cell cell, String cellStr) { switch (cell.getCellType()) { case Cell.CELL_TYPE_STRING: // 读取String cellStr = cell.getStringCellValue().toString(); break; case Cell.CELL_TYPE_BOOLEAN: // 得到Boolean对象的方法 cellStr = String.valueOf(cell.getBooleanCellValue()); break; case Cell.CELL_TYPE_NUMERIC: // 先看是否是日期格式 if (DateUtil.isCellDateFormatted(cell)) { // 读取日期格式 cellStr = formatTime(cell.getDateCellValue().toString()); } else { // 读取数字 cellStr = String.valueOf(cell.getNumericCellValue()); } break; case Cell.CELL_TYPE_FORMULA: // 读取公式 cellStr = cell.getCellFormula().toString(); break; } return cellStr; } private boolean isExcel2007(String fileName) { return fileName.matches("^.+\\.(?i)(xlsx)$"); } private String formatTime(String s) { SimpleDateFormat sf = new SimpleDateFormat("EEE MMM dd hh:mm:ss z yyyy", Locale.ENGLISH); Date date = null; try { date = sf.parse(s); } catch (ParseException ex) { Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex); } SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); String result = sdf.format(date); return result; } /** * Excel写操作,简单起见还是采用内存数据一次写入 * @param filePath 输出文件路径名 * @param dataList 输出文件内容,List<String>行 List列 * @throws IOException */ public void writeExcel(String filePath,List<List<String>> dataList) throws IOException{ HSSFWorkbook wb = new HSSFWorkbook(); HSSFSheet sheet = wb.createSheet("sheet");// 添加sheet // 表格样式 HSSFCellStyle style = wb.createCellStyle(); style.setAlignment(HSSFCellStyle.ALIGN_CENTER);// 指定单元格居中对齐 // // 边框 // style.setBorderBottom(HSSFCellStyle.BORDER_MEDIUM); // style.setBorderTop(HSSFCellStyle.BORDER_MEDIUM); // style.setBorderLeft(HSSFCellStyle.BORDER_MEDIUM); // style.setBorderRight(HSSFCellStyle.BORDER_MEDIUM); // //设置字体 // HSSFFont f = wb.createFont(); // f.setFontHeightInPoints((short)10); // f.setBoldweight(HSSFFont.BOLDWEIGHT_NORMAL); // style.setFont(f); // //设置列宽 // sheet.setColumnWidth((short)0, (short)9600); // sheet.setColumnWidth((short)1, (short)4000); // sheet.setColumnWidth((short)2, (short)8000); // sheet.setColumnWidth((short)3, (short)8000); // 在索引0的位置创建第一行 for (int i = 0; i < dataList.size(); i++) { HSSFRow row = sheet.createRow(i); List<String> list = dataList.get(i); for (int j = 0; j < list.size(); j++) { HSSFCell cell = row.createCell(j); cell.setCellValue(list.get(j)); cell.setCellStyle(style); } } // 导出文件 FileOutputStream fout = new FileOutputStream(filePath); wb.write(fout); fout.close(); } }
package dat.datadeal; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * * @author daT [email protected] * */ public class DataClean { /** * 清洗html标签 * @param inputString * @return */ public static String delHtml(String inputString) { String htmlStr = inputString; // 含html标签的字符串 String textStr = ""; java.util.regex.Pattern p_script; java.util.regex.Matcher m_script; java.util.regex.Pattern p_html; java.util.regex.Matcher m_html; try { String regEx_html = "<[^>]+>"; // 定义HTML标签的正则表达式 String regEx_script = "<[/s]*?script[^>]*?>[/s/S]*?<[/s]*?//[/s]*?script[/s]*?>"; // 定义script的正则表达式{或<script[^>]*?>[/s/S]*?<//script> p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE); m_script = p_script.matcher(htmlStr); htmlStr = m_script.replaceAll(""); // 过滤script标签 p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE); m_html = p_html.matcher(htmlStr); htmlStr = m_html.replaceAll(""); // 过滤html标签 textStr = htmlStr; } catch (Exception e) { System.err.println("Html2Text: " + e.getMessage()); } return textStr;// 返回文本字符串 } /** * 处理掉信息中的url地址 */ public static String dealWithUrl(String str){ String regEx = "[http|https]+[://]+[0-9A-Za-z:/[-]_#[?][=][.][&]]*"; Pattern p = Pattern.compile(regEx); Matcher m = p.matcher(str); return m.replaceAll(""); } public static void main(String[] args) throws IOException{ ExcelUtil excelUtil = new ExcelUtil(); List<List<String>> writeList = new ArrayList<List<String>>(); List<String[]> readList =excelUtil.readExcel("/home/dat/javatest/微博数据_.xlsx"); for(String[] lineArray:readList){ List<String> strList = new ArrayList<String>(); for(String str:lineArray){ String strTmp = DataClean.dealWithUrl(DataClean.delHtml(str)); strList.add(strTmp); //System.out.println(strTmp); } writeList.add(strList); } excelUtil.writeExcel("/home/dat/javatest/weibo.xlsx",writeList); System.out.println("job has finished..........."); } }
清洗后数据: