Java在网页中爬取数据并存入excel

pom中引入:

		
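		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
			<version>4.5.5</version>
		</dependency>

		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.11.2</version>
		</dependency>
		<dependency>
			<groupId>org.apache.poi</groupId>
			<artifactId>poi-ooxml</artifactId>
			<version>3.16</version>
		</dependency>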
		
	/**
	 * Reads the content behind {@code url} line by line and collects every
	 * substring that matches the regular expression {@code str}.
	 *
	 * <p>Matching is done per line, so a match can never span a line break.
	 * Each line read is also echoed to standard output.
	 *
	 * @param url     location to read from
	 * @param enCoded name of the charset used to decode the response (e.g. "utf-8")
	 * @param str     regular expression; every match of it is collected
	 * @return all matches in the order they were found; empty if there are none
	 * @throws IOException if the URL cannot be opened or read, or the charset is unsupported
	 */
	public static List<String> getStringByWeb(URL url, String enCoded, String str) throws IOException{
		// Compile first so an invalid pattern fails before any connection is opened.
		Pattern p = Pattern.compile(str);
		List<String> list = new ArrayList<String>();
		// try-with-resources closes the reader (and the underlying URL stream)
		// even when reading fails half-way; the original never closed it.
		try (BufferedReader bufr = new BufferedReader(new InputStreamReader(url.openStream(), enCoded))) {
			String line;
			while ((line = bufr.readLine()) != null) {
				System.out.println(line);
				Matcher m = p.matcher(line);
				// A single line may contain several matches.
				while (m.find()) {
					list.add(m.group());
				}
			}
		}
		return list;
	}

	/**
	 * Replaces the data rows of the first sheet of an existing Excel file with
	 * the given values, one value per row in the first column.
	 *
	 * <p>Row 0 is treated as the header row and is left untouched; every other
	 * existing row is removed before the new values are written starting at
	 * row 1. Failures are reported with a stack trace and do not propagate to
	 * the caller; the success message is printed only when the file was
	 * actually written.
	 *
	 * @param dataList      values to write, one per row
	 * @param cloumnCount   kept for backward compatibility; only column 0 is
	 *                      written, so this value has no effect on the output
	 * @param finalXlsxPath path of the existing .xls/.xlsx file to update in place
	 */
	public static void writeExcel(List<String> dataList, int cloumnCount,String finalXlsxPath){
		File finalXlsxFile = new File(finalXlsxPath);
		// try-with-resources releases the workbook on every path; a null
		// resource (unsupported extension) is simply skipped on close.
		try (Workbook workBook = getWorkbok(finalXlsxFile)) {
			if (workBook == null) {
				System.err.println("Unsupported Excel file type: " + finalXlsxPath);
				return;
			}
			// The first sheet is the one being rewritten.
			Sheet sheet = workBook.getSheetAt(0);

			// Drop all existing data rows but keep the header (row 0).
			int lastRowNum = sheet.getLastRowNum();
			for (int i = 1; i <= lastRowNum; i++) {
				Row row = sheet.getRow(i);
				// getRow returns null for rows that were never created.
				if (row != null) {
					sheet.removeRow(row);
				}
			}

			// Write the new values below the header; the original started at
			// row 0 and therefore overwrote the header it had just preserved.
			for (int j = 0; j < dataList.size(); j++) {
				Row row = sheet.createRow(j + 1);
				Cell first = row.createCell(0);
				first.setCellValue(dataList.get(j));
			}

			// Sheet changes only reach disk once the workbook is written out.
			// A single stream, opened after all edits, replaces the two
			// streams of the original (the first of which was never closed).
			try (OutputStream out = new FileOutputStream(finalXlsxPath)) {
				workBook.write(out);
			}
			System.out.println("数据导出成功");
		} catch (Exception e) {
			// Best-effort export: report the failure without propagating it.
			e.printStackTrace();
		}
	}
	/**
	 * Loads an Excel workbook from {@code file}, choosing the implementation
	 * from the file name suffix.
	 *
	 * @param file existing Excel file to read
	 * @return an {@code HSSFWorkbook} when the name ends with {@code EXCEL_XLS},
	 *         an {@code XSSFWorkbook} when it ends with {@code EXCEL_XLSX},
	 *         or {@code null} when the name matches neither suffix
	 * @throws IOException if the file cannot be opened or parsed
	 */
	public static Workbook getWorkbok(File file) throws IOException{
		Workbook wb = null;
		// The original never closed this stream. Closing it here assumes the
		// workbook constructors consume the whole stream up front, as the POI
		// documentation describes.
		try (FileInputStream in = new FileInputStream(file)) {
			if(file.getName().endsWith(EXCEL_XLS)){     // Excel 2003
				wb = new HSSFWorkbook(in);
			}else if(file.getName().endsWith(EXCEL_XLSX)){    // Excel 2007/2010
				wb = new XSSFWorkbook(in);
			}
		}
		return wb;
	}

main方法测试(我这里爬取的是链接):

    /**
     * Demo entry point: collects every link found on a page and exports the
     * links to an Excel file.
     *
     * @param args unused
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL("http://xxxx");
        // Charset used to decode the page.
        String enCoded = "utf-8";
        // Regular expression describing a link.
        String str = "(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]";
        // Collect every link found on the page.
        List<String> list = getStringByWeb(url,enCoded,str);
        // The original passed an undefined variable "arrayList" here.
        writeExcel(list, list.size(),"C:/xxx.xlsx");
    }

 

你可能感兴趣的:(Java)