微博数据清洗(Java版)

大数据公益大学提供的一份数据,义务处理一下,原始数据是Excel,含有html标签,如下:



要求清洗掉html标签,和微博内容中的url地址。


主要分为两部分:

1.处理文本,清洗数据。

2.处理excel读写操作。


上代码:

ExcelUtil类,包含Excel2003-2007的读写操作,Excel使用Apache POI进行操作,需要jar包如下:

微博数据清洗(Java版)_第1张图片

package dat.datadeal;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

/**
 * 
 * @author daT [email protected]
 *2003,2007版excel读写工具
 */
public class ExcelUtil{
	
	/**
	 * Excel文件读取
	 * @param filePath
	 * @return String[]存的是行,List存的是列。
	 * 一个excel一次全部读入内存(Excel超大需要另行处理)
	 */
	public  List<String[]> readExcel(String filePath) {  
        List<String[]> dataList = new ArrayList<String[]>();  
        boolean isExcel2003 = true;  
        if (isExcel2007(filePath)) {  
            isExcel2003 = false;  
        }  
        File file = new File(filePath);  
        InputStream is = null;  
        try {  
            is = new FileInputStream(file);  
        } catch (FileNotFoundException ex) {  
            Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);  
        }  
        Workbook wb = null;  
        try {  
            wb = isExcel2003 ? new HSSFWorkbook(is) : new XSSFWorkbook(is);  
        } catch (IOException ex) {  
            Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);  
        }  
        Sheet sheet = wb.getSheetAt(0);  
        int totalRows = sheet.getPhysicalNumberOfRows();  
        int totalCells = 0;  
        if (totalRows >= 1 && sheet.getRow(0) != null) {  
            totalCells = sheet.getRow(0).getPhysicalNumberOfCells();  
        }  
        for (int r = 0; r < totalRows; r++) {  
            Row row = sheet.getRow(r);  
            if (row == null) {  
                continue;  
            }  
            String[] rowList = new String[totalCells];  
            for (int c = 0; c < totalCells; c++) {  
                Cell cell = row.getCell(c);  
                String cellValue = "";  
                if (cell == null) {  
                    rowList[c] = (cellValue);  
                    continue;  
                }  
                cellValue = ConvertCellStr(cell, cellValue);  
                rowList[c] = (cellValue);  
            }  
            dataList.add(rowList);  
        }  
        return dataList;  
    }  
	
	
	private String ConvertCellStr(Cell cell, String cellStr) {  
        switch (cell.getCellType()) {  
            case Cell.CELL_TYPE_STRING:  
                // 读取String  
                cellStr = cell.getStringCellValue().toString();  
                break;  
            case Cell.CELL_TYPE_BOOLEAN:  
                // 得到Boolean对象的方法  
                cellStr = String.valueOf(cell.getBooleanCellValue());  
                break;  
            case Cell.CELL_TYPE_NUMERIC:  
                // 先看是否是日期格式  
                if (DateUtil.isCellDateFormatted(cell)) {  
                    // 读取日期格式  
                    cellStr = formatTime(cell.getDateCellValue().toString());  
                } else {  
                    // 读取数字  
                    cellStr = String.valueOf(cell.getNumericCellValue());  
                }  
                break;  
            case Cell.CELL_TYPE_FORMULA:  
                // 读取公式  
                cellStr = cell.getCellFormula().toString();  
                break;  
        }  
        return cellStr;  
    }  
  


	private boolean isExcel2007(String fileName) {  
	        return fileName.matches("^.+\\.(?i)(xlsx)$");  
	 } 
	 
	private String formatTime(String s) {  
        SimpleDateFormat sf = new SimpleDateFormat("EEE MMM dd hh:mm:ss z yyyy", Locale.ENGLISH);  
        Date date = null;  
        try {  
            date = sf.parse(s);  
        } catch (ParseException ex) {  
            Logger.getLogger(ExcelUtil.class.getName()).log(Level.SEVERE, null, ex);  
        }  
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");  
        String result = sdf.format(date);  
        return result;  
    }  
	 
	
	/**
	 * Excel写操作,简单起见还是采用内存数据一次写入
	 * @param filePath 输出文件路径名
	 * @param dataList 输出文件内容,List<String>行  List列
	 * @throws IOException
	 */
	public 	void writeExcel(String filePath,List<List<String>> dataList) throws IOException{
	        HSSFWorkbook wb = new HSSFWorkbook();  
	        HSSFSheet sheet = wb.createSheet("sheet");// 添加sheet  
	        // 表格样式  
	        HSSFCellStyle style = wb.createCellStyle();  
	        style.setAlignment(HSSFCellStyle.ALIGN_CENTER);// 指定单元格居中对齐  
	        // // 边框  
	        // style.setBorderBottom(HSSFCellStyle.BORDER_MEDIUM);  
	        // style.setBorderTop(HSSFCellStyle.BORDER_MEDIUM);  
	        // style.setBorderLeft(HSSFCellStyle.BORDER_MEDIUM);  
	        // style.setBorderRight(HSSFCellStyle.BORDER_MEDIUM);  
	        // //设置字体  
	        // HSSFFont f = wb.createFont();  
	        // f.setFontHeightInPoints((short)10);  
	        // f.setBoldweight(HSSFFont.BOLDWEIGHT_NORMAL);  
	        // style.setFont(f);  
	        // //设置列宽  
	        // sheet.setColumnWidth((short)0, (short)9600);  
	        // sheet.setColumnWidth((short)1, (short)4000);  
	        // sheet.setColumnWidth((short)2, (short)8000);  
	        // sheet.setColumnWidth((short)3, (short)8000);  
	  
	        // 在索引0的位置创建第一行  
	  
	        for (int i = 0; i < dataList.size(); i++) {  
	            HSSFRow row = sheet.createRow(i);  
	            List<String> list = dataList.get(i);  
	            for (int j = 0; j < list.size(); j++) {  
	                HSSFCell cell = row.createCell(j);  
	                cell.setCellValue(list.get(j));  
	                cell.setCellStyle(style);  
	            }  
	        }  
	        // 导出文件  
	        FileOutputStream fout = new FileOutputStream(filePath);  
	        wb.write(fout);  
	        fout.close();  
	}
	
}

DataClean类,包含对html标签,信息中url的的清洗。

package dat.datadeal;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * 
 * @author daT [email protected]
 *
 */
public class DataClean {
	
	/**
	 * 清洗html标签
	 * @param inputString
	 * @return
	 */
	public static String delHtml(String inputString) {
        String htmlStr = inputString; // 含html标签的字符串
        String textStr = "";
        java.util.regex.Pattern p_script;
        java.util.regex.Matcher m_script;
        java.util.regex.Pattern p_html;
        java.util.regex.Matcher m_html;
        try {
            String regEx_html = "<[^>]+>"; // 定义HTML标签的正则表达式
            String regEx_script = "<[/s]*?script[^>]*?>[/s/S]*?<[/s]*?//[/s]*?script[/s]*?>"; // 定义script的正则表达式{或<script[^>]*?>[/s/S]*?<//script>
            p_script = Pattern.compile(regEx_script, Pattern.CASE_INSENSITIVE);
            m_script = p_script.matcher(htmlStr);
            htmlStr = m_script.replaceAll(""); // 过滤script标签
            p_html = Pattern.compile(regEx_html, Pattern.CASE_INSENSITIVE);
            m_html = p_html.matcher(htmlStr);
            htmlStr = m_html.replaceAll(""); // 过滤html标签
            textStr = htmlStr;
        } catch (Exception e) {
            System.err.println("Html2Text: " + e.getMessage());
        }
        return textStr;// 返回文本字符串
    }
	
	/**
	 * 处理掉信息中的url地址
	 */
	public static String dealWithUrl(String str){
		String regEx = "[http|https]+[://]+[0-9A-Za-z:/[-]_#[?][=][.][&]]*";
        Pattern p = Pattern.compile(regEx);   
        Matcher m = p.matcher(str);
        return m.replaceAll("");
	}
	
	
	public static void main(String[] args) throws IOException{
		ExcelUtil excelUtil = new ExcelUtil();
		List<List<String>> writeList = new ArrayList<List<String>>();
		List<String[]> readList =excelUtil.readExcel("/home/dat/javatest/微博数据_.xlsx");
		for(String[] lineArray:readList){
			List<String> strList = new ArrayList<String>();
			for(String str:lineArray){
				String strTmp = DataClean.dealWithUrl(DataClean.delHtml(str));
				strList.add(strTmp);
				//System.out.println(strTmp);
			}
			writeList.add(strList);
		}
		
		excelUtil.writeExcel("/home/dat/javatest/weibo.xlsx",writeList);  
	    System.out.println("job has finished...........");
	}
}


清洗后数据:


你可能感兴趣的:(微博数据清洗(Java版))