数据中有excel文件的形式,如果我们不用python做,则需要通过poi插件,将excel中的内容读取出来,可以放在内存中直接处理,也可以将文件的格式转换成普通文本类型,例如txt,csv,或者无后缀名文件。可以用文本编辑工具,如editplus等打开查看,做相应的后续操作。
这里是以\001作为分隔符,防止空格的干扰,方便后续操作。本来打算以\005作为行分隔符,即保留原文中的样式,又不会受到\n的干扰。但是Spark ,MapReduce大都是以\n作为行分隔符读取,为了方便后续操作,所以我们清理了每一个单元格中的\n,然后以\n作为行分隔符。
//获取每一个单元格中的内容,如果为空将单元格内容替换为null。
if (row.getCell(z) == null || row.getCell(z).toString().equals("")) {
cell = "null";
} else {
cell = row.getCell(z).toString();
}
//将读取到的所有单元格的内容都拼接在一起,形成字符串,并清理格式去掉回车符。
if (z == cellNum - 1) {
cells = cells + cell.replaceAll("\n", "。") + "\n";
} else {
cells = cells + cell.replaceAll("\n", "。") + "\001";
}
完整代码如下:
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
public class ExcelReaderUtils {
private Workbook wb;
private Sheet sheet;
private Row row;
private Logger logger = LoggerFactory.getLogger(this.getClass());
public ExcelReaderUtils(String filePath) {
if (filePath == null) {
return;
}
String type = filePath.substring(filePath.lastIndexOf("."));
try {
FileInputStream fis = new FileInputStream(filePath);
if (".xls".equals(type)) {
wb = new HSSFWorkbook(fis);
} else if (".xlsx".equals(type)) {
wb = new XSSFWorkbook(fis);
} else {
wb = null;
}
} catch (FileNotFoundException e) {
logger.error("文件未找到", e);
} catch (IOException e) {
logger.error("IO异常", e);
}
}
public List> readExcel() {
List> list = new ArrayList<>();
int sheetNum = wb.getActiveSheetIndex();
for (int x = 0; x <= sheetNum; x++) {
sheet = wb.getSheetAt(x);
int rowNum = sheet.getLastRowNum();
List context = new ArrayList<>();
for (int y = 0; y < rowNum; y++) {
row = sheet.getRow(y);
//判断是否为空行
if (row != null) {
int cellNum = row.getLastCellNum();
String cells = "";
String cell = "";
for (int z = 0; z < cellNum; z++) {
//获取每一个单元格中的内容,如果为空将单元格内容替换为null。
if (row.getCell(z) == null || row.getCell(z).toString().equals("")) {
cell = "null";
} else {
cell = row.getCell(z).toString();
}
//将读取到的所有单元格的内容都拼接在一起,形成字符串,并清理格式去掉回车符。
if (z == cellNum - 1) {
cells = cells + cell.replaceAll("\n", "。") + "\n";
} else {
cells = cells + cell.replaceAll("\n", "。") + "\001";
}
}
context.add(cells);
}
}
list.add(context);
}
return list;
}
public void writeFile(List> contextList, String filePath){
writeFile(contextList,filePath,null);
}
public void writeFile(List> contextList, String filePath, String fileType) {
//判断文件路径是否存在,如果不存在则创建
File path = new File(filePath);
if (!path.exists()) {
System.out.println("路径不存在,正在创建...");
path.mkdirs();
System.out.println("创建完成!");
}
//判断输入的文件生成类型
String type = "";
if (fileType != null && fileType != "") {
type = "." + fileType;
}
String sheetName = null;
File file = null;
FileWriter fw = null;
for (int i=0; i context = contextList.get(i);
sheetName = wb.getSheetAt(i).getSheetName();
file = new File(path.getAbsolutePath()+"/"+sheetName+type);
try {
file.createNewFile();
fw = new FileWriter(file);
for (int j=0; j> list = reader.readExcel();
System.out.println("---------------------------开始读取Excel文件-----------------------------");
for (int i = 0; i < list.size(); i++) {
List context = list.get(i);
for (int j = 0; j < context.size(); j++) {
System.out.println(context.get(j));
}
System.out.println("------------------------------------------------------------------------");
}
System.out.println("-------------------------------读取结束---------------------------------");
String outPath = "F:/ExcelOut";
reader.writeFile(list,outPath);
}
}
pom文件依赖如下:
org.apache.poi
poi-ooxml
3.17
org.apache.poi
poi
3.17
org.apache.poi
poi-examples
3.17
org.apache.poi
poi-excelant
3.17
org.apache.poi
poi-ooxml-schemas
3.17
org.apache.poi
poi-scratchpad
3.17
dom4j
dom4j
1.6.1
stax
stax-api
1.0
org.apache.xmlbeans
xmlbeans
2.6.0