《2021博客之星年度总评选》数据采集Java样例程序
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>1</groupId>
    <artifactId>_psimplemvn</artifactId>
    <version>1.0-SNAPSHOT</version>
    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>8</source>
                    <target>8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <dependencies>
        <dependency>
            <groupId>org.seleniumhq.selenium</groupId>
            <artifactId>selenium-chrome-driver</artifactId>
            <version>4.0.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi-ooxml</artifactId>
            <version>3.17</version>
        </dependency>
    </dependencies>
</project>
/**
* Copyright (C), 2000-2021, XXX有限公司
* FileName: BlogStarStatisticsTest
* Author: wangyetao
* Date: 21-12-26 23:38:10
* Description: 线上投票博客之星数据采集
*
* History:
* 作者姓名
* 修改时间
* 版本号
* 版本描述
*/
package simple.call.blogstar;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Collects the CSDN "Blog Star 2020" candidate list and exports it to an Excel workbook.
 *
 * <p>{@code main} opens {@code url} in Chrome through Selenium, turns every {@code li} under the
 * element with id {@code blogList} into a {@link BlogStar}, and writes the rows to
 * {@code outPutPath + filename + suffix}.
 *
 * @author wangyetao
 */
public class BlogStarStatisticsTest {
    /** Directory the workbook is written to (includes the trailing slash). */
    private static String outPutPath = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Base name of the output file, without extension. */
    private static String filename = "blog_star2020";
    /** Name of the single sheet created in the workbook. */
    private static String sheetname = filename.toUpperCase();
    private static String suffix = ".xlsx";
    /** Rows collected from the page during the current run. */
    private static ArrayList<BlogStar> blogStars;
    /** Address of the blog_star2020 voting page. */
    private static String url = "https://bss.csdn.net/m/topic/blog_star2020";

    /**
     * Scrapes the candidate list and writes it to the output workbook.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // The chromedriver binary location must be set before the driver is created.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get(url);
            // Text of a single list item as seen on the page (the HTML markup of the original
            // note was lost):
            //   001
            //   ✎ℳ๓₯㎕...雲淡風輕
            //   码龄6年
            //   2020年度原创博文:77 篇
            //   当前票数: 392 票
            blogStars = new ArrayList<BlogStar>();
            // Give the page time to finish rendering before querying the DOM.
            Thread.sleep(3000);
            List<WebElement> items = driver.findElements(By.xpath("//*[@id=\"blogList\"]/li"));
            for (WebElement item : items) {
                BlogStar blogStar = new BlogStar();
                // Time the record was captured.
                blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyy-MM-dd HH:mm:ss");
                // Sequence number.
                blogStar.num = item.findElement(By.className("num")).getText();
                // Blogger display name.
                blogStar.name = item.findElement(By.className("name")).getText();
                // Avatar image URL.
                blogStar.avatarUrl = item.findElement(By.tagName("img")).getAttribute("src");
                // Coding age in years.
                // NOTE(review): presumably StringUtil.getInts returns the integers found in the
                // text, in order - confirm against StringUtil.
                blogStar.intlevel = StringUtil.getInts(item.findElement(By.className("level")).getText())[0];
                // Original posts this year: index 1 is used because the text starts with the year
                // (see the sample above), so the count is the second number.
                blogStar.intBlogNum = StringUtil.getInts(item.findElement(By.className("blog-num")).getText())[1];
                // Current vote count.
                blogStar.intCurrentVote = StringUtil.getInts(item.findElement(By.className("current-vote")).getText())[0];
                blogStars.add(blogStar);
            }
        } finally {
            // quit() ends the whole WebDriver session; doing it in finally also releases the
            // browser when scraping fails part-way.
            driver.quit();
        }

        ArrayList<String> heads = new ArrayList<String>();
        heads.add("序号");
        heads.add("博客简称");
        heads.add("小头像url");
        heads.add("码龄(年)");
        heads.add("年度原创博文数");
        heads.add("当前票数");
        heads.add("录入时间");

        System.out.println("Creating excel");
        // try-with-resources closes the stream and the workbook even when writing fails.
        try (XSSFWorkbook workbook = new XSSFWorkbook()) {
            XSSFSheet sheet = workbook.createSheet(sheetname);
            // Column widths (POI measures them in 1/256ths of a character width): narrow
            // sequence column, wide timestamp column, default for the rest.
            for (int i = 0; i < heads.size(); i++) {
                int chars;
                if (i == 0) {
                    chars = 6;
                } else if (i == 6) {
                    chars = 20;
                } else {
                    chars = 15;
                }
                sheet.setColumnWidth(i, chars * 256);
            }
            // Header row.
            Row header = sheet.createRow(0);
            for (int i = 0; i < heads.size(); i++) {
                header.createCell(i).setCellValue(heads.get(i));
            }
            // One row per collected record, in the same column order as the header.
            int rowNum = 1;
            for (BlogStar blogStar : blogStars) {
                Row row = sheet.createRow(rowNum++);
                int col = 0;
                row.createCell(col++).setCellValue(blogStar.num);
                row.createCell(col++).setCellValue(blogStar.name);
                row.createCell(col++).setCellValue(blogStar.avatarUrl);
                row.createCell(col++).setCellValue(blogStar.intlevel);
                row.createCell(col++).setCellValue(blogStar.intBlogNum);
                row.createCell(col++).setCellValue(blogStar.intCurrentVote);
                row.createCell(col++).setCellValue(blogStar.createTime);
            }
            try (FileOutputStream out = new FileOutputStream(outPutPath + filename + suffix)) {
                workbook.write(out);
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException when the output path cannot be opened.
            e.printStackTrace();
        }
        System.out.println("Done");
    }
}
/**
* Copyright (C), 2000-2021, XXX有限公司
* FileName: BlogStarStatisticsVoteLeaderboardList
* Author: wangyetao
* Date: 21-12-27 02:43:32
* Description: 投票贡献排行榜
*
* History:
* 作者姓名
* 修改时间
* 版本号
* 版本描述
*/
package simple.call.blogstar;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Collects the vote-contribution leaderboard of one Blog Star 2020 candidate and exports it to
 * an Excel workbook.
 *
 * <p>{@code main} opens {@code url} in Chrome through Selenium, turns every {@code li} under the
 * element with id {@code voteLeaderboardList} into a {@link BlogStar}, and writes the rows to
 * {@code outPutPath + filename + suffix}.
 *
 * @author wangyetao
 */
public class BlogStarStatisticsVoteLeaderboardList {
    /** Directory the workbook is written to (includes the trailing slash). */
    private static String outPutPath = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Base name of the output file; also used as the sheet name. */
    private static String filename = "aa518189";
    private static String sheetname = filename;
    private static String suffix = ".xlsx";
    /** Rows collected from the page during the current run. */
    private static ArrayList<BlogStar> blogVotes;
    /** Address of the candidate's detail page that shows the leaderboard. */
    private static String url = "https://bss.csdn.net/m/topic/blog_star2020/detail?username=aa518189";

    /**
     * Scrapes the leaderboard and writes it to the output workbook.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // The chromedriver binary location must be set before the driver is created.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get(url);
            // Text of a single list item as seen on the page (the HTML markup of the original
            // note was lost):
            //   1
            //   swagLi
            //   码龄4年
            //   36票
            blogVotes = new ArrayList<BlogStar>();
            // Give the page time to finish rendering before querying the DOM.
            Thread.sleep(2000);
            List<WebElement> items = driver.findElements(By.xpath("//*[@id=\"voteLeaderboardList\"]/li"));
            for (WebElement item : items) {
                BlogStar blogStar = new BlogStar();
                // Time the record was captured.
                blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyy-MM-dd HH:mm:ss");
                // Position in the leaderboard.
                blogStar.num = item.findElement(By.className("num")).getText();
                // Supporter display name.
                blogStar.name = item.findElement(By.className("text")).getText();
                // Coding age in years.
                // NOTE(review): presumably StringUtil.getInts returns the integers found in the
                // text, in order - confirm against StringUtil.
                blogStar.intlevel = StringUtil.getInts(item.findElement(By.className("code-age")).getText())[0];
                // Number of votes contributed.
                blogStar.intCurrentVote = StringUtil.getInts(item.findElement(By.className("vote-num")).getText())[0];
                blogVotes.add(blogStar);
            }
        } finally {
            // quit() ends the whole WebDriver session; doing it in finally also releases the
            // browser when scraping fails part-way.
            driver.quit();
        }

        ArrayList<String> heads = new ArrayList<String>();
        heads.add("编号");
        heads.add("博粉名称");
        heads.add("码龄(年)");
        heads.add("支持票数");
        heads.add("录入时间");

        System.out.println("Creating excel");
        // try-with-resources closes the stream and the workbook even when writing fails.
        try (XSSFWorkbook workbook = new XSSFWorkbook()) {
            XSSFSheet sheet = workbook.createSheet(sheetname);
            // Column widths (POI measures them in 1/256ths of a character width): narrow
            // number column, wide timestamp column, default for the rest.
            for (int i = 0; i < heads.size(); i++) {
                int chars;
                if (i == 0) {
                    chars = 6;
                } else if (i == 4) {
                    chars = 20;
                } else {
                    chars = 15;
                }
                sheet.setColumnWidth(i, chars * 256);
            }
            // Header row.
            Row header = sheet.createRow(0);
            for (int i = 0; i < heads.size(); i++) {
                header.createCell(i).setCellValue(heads.get(i));
            }
            // One row per collected record, in the same column order as the header.
            int rowNum = 1;
            for (BlogStar blogStar : blogVotes) {
                Row row = sheet.createRow(rowNum++);
                int col = 0;
                row.createCell(col++).setCellValue(blogStar.num);
                row.createCell(col++).setCellValue(blogStar.name);
                row.createCell(col++).setCellValue(blogStar.intlevel);
                row.createCell(col++).setCellValue(blogStar.intCurrentVote);
                row.createCell(col++).setCellValue(blogStar.createTime);
            }
            try (FileOutputStream out = new FileOutputStream(outPutPath + filename + suffix)) {
                workbook.write(out);
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException when the output path cannot be opened.
            e.printStackTrace();
        }
        System.out.println("Done");
    }
}
/**
* Copyright (C), 2000-2021, XXX有限公司
* FileName: Blogstar2021
* Author: wangyetao
* Date: 21-12-28 15:50:02
* Description: 线上评分TOP90数据采集,输出blogstar2021.xlsx
*
* History:
* 作者姓名
* 修改时间
* 版本号
* 版本描述
*
* wangyetao
* 2022年 01月 01日 星期六 06:38:36 CST
* 版本号
* 最近一次修改
*/
package simple.call.blogstar;
import org.apache.poi.common.usermodel.HyperlinkType;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CreationHelper;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFHyperlink;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;
import java.io.*;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Collects the per-field scoring list from the CSDN "Blog Star 2021" page and appends it as a
 * new, timestamp-named sheet to {@code blogstar2021.xlsx}.
 *
 * <p>{@code main} clicks every tab under the {@code authorscoring-nav} element, reads each
 * {@code scoreitem} element shown for that tab into a {@link BlogStar}, and then adds one sheet
 * with all collected rows to the workbook at {@code outPutPath + filename + suffix}.
 *
 * @author wangyetao
 */
public class Blogstar2021 {
    /** Directory of the workbook (includes the trailing slash). */
    private static String outPutPath = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Base name of the workbook; an existing file is extended, a missing one is created. */
    private static String filename = "blogstar2021";
    /** Sheet name for this run: the start time, so each run adds a distinct sheet. */
    private static String sheetname = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyyMMddHHmmss");
    private static String suffix = ".xlsx";
    /** Rows collected from the page during the current run. */
    private static ArrayList<BlogStar> blogStars;
    /** Address of the blogstar2021 page. */
    private static String url = "https://www.csdn.net/blogstar2021";

    /**
     * Scrapes all field tabs and appends the result sheet to the workbook.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // The chromedriver binary location must be set before the driver is created.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get(url);
            blogStars = new ArrayList<BlogStar>();
            // Give the page time to finish rendering before querying the DOM.
            Thread.sleep(3000);
            List<WebElement> lis = driver.findElement(By.className("authorscoring-nav")).findElements(By.tagName("li"));
            for (int i = 0; i < lis.size(); i++) {
                WebElement tab = lis.get(i);
                tab.click();
                // Wait for the tab's content to be shown.
                Thread.sleep(2000);
                // The tab caption is the field name shared by every entry of this tab.
                String field = tab.getText();
                List<WebElement> boxs = driver.findElements(By.className("scoreitem"));
                for (WebElement box : boxs) {
                    BlogStar blogStar = new BlogStar();
                    blogStar.field = field;
                    // Blogger display name.
                    blogStar.name = box.findElement(By.className("name")).getText();
                    List<WebElement> dts = box.findElements(By.tagName("dt"));
                    // First dt: ranking within the field; second dt: score.
                    blogStar.ranking = dts.get(0).getText();
                    // NOTE(review): presumably StringUtil.getInts returns the integers found in
                    // the text, in order - confirm against StringUtil.
                    blogStar.score = StringUtil.getInts(dts.get(1).getText())[0];
                    // Third link of the entry: scoring page; first link: blog home page.
                    blogStar.scorePage = box.findElements(By.tagName("a")).get(2).getAttribute("href");
                    blogStar.blogUrl = box.findElement(By.tagName("a")).getAttribute("href");
                    // Time the record was captured.
                    blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyy-MM-dd HH:mm:ss");
                    blogStars.add(blogStar);
                }
                // Look the tabs up again before the next iteration, as the original code does
                // after each click.
                lis = driver.findElement(By.className("authorscoring-nav")).findElements(By.tagName("li"));
            }
        } finally {
            // quit() ends the whole WebDriver session; doing it in finally also releases the
            // browser when scraping fails part-way.
            driver.quit();
        }

        ArrayList<String> heads = new ArrayList<String>();
        heads.add("领域");
        heads.add("博主简称");
        heads.add("领域排名");
        heads.add("总评分");
        heads.add("参赛互动页");
        heads.add("博主首页");
        heads.add("录入时间");

        System.out.println("Creating excel");
        File file = new File(outPutPath + filename + suffix);
        // try-with-resources closes the stream and the workbook even when writing fails.
        try (XSSFWorkbook workbook = openWorkbook(file)) {
            XSSFSheet sheet = workbook.createSheet(sheetname);
            // Column widths (POI measures them in 1/256ths of a character width).
            sheet.setColumnWidth(0, 16 * 256);
            sheet.setColumnWidth(1, 20 * 256);
            sheet.setColumnWidth(2, 10 * 256);
            sheet.setColumnWidth(3, 10 * 256);
            sheet.setColumnWidth(4, 20 * 256);
            sheet.setColumnWidth(5, 20 * 256);
            sheet.setColumnWidth(6, 25 * 256);
            // Header row.
            Row header = sheet.createRow(0);
            for (int i = 0; i < heads.size(); i++) {
                header.createCell(i).setCellValue(heads.get(i));
            }
            CreationHelper createHelper = workbook.getCreationHelper();
            // One row per collected record, in the same column order as the header.
            int rowNum = 1;
            for (BlogStar blogStar : blogStars) {
                Row row = sheet.createRow(rowNum++);
                int col = 0;
                row.createCell(col++).setCellValue(blogStar.field);
                row.createCell(col++).setCellValue(blogStar.name);
                row.createCell(col++).setCellValue(blogStar.ranking);
                row.createCell(col++).setCellValue(blogStar.score);
                // The two URL columns carry a hyperlink in addition to the URL text.
                Cell scoreCell = row.createCell(col++);
                XSSFHyperlink scoreLink = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
                scoreLink.setAddress(blogStar.scorePage);
                scoreCell.setHyperlink(scoreLink);
                scoreCell.setCellValue(blogStar.scorePage);
                Cell blogCell = row.createCell(col++);
                XSSFHyperlink blogLink = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
                blogLink.setAddress(blogStar.blogUrl);
                blogCell.setHyperlink(blogLink);
                blogCell.setCellValue(blogStar.blogUrl);
                row.createCell(col++).setCellValue(blogStar.createTime);
            }
            // The workbook is fully in memory at this point, so the same file can be rewritten.
            try (FileOutputStream out = new FileOutputStream(file)) {
                workbook.write(out);
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException when the file cannot be opened.
            e.printStackTrace();
        }
        System.out.println("Done");
    }

    /**
     * Loads the existing workbook so earlier sheets are kept, or starts an empty one when the
     * file is missing or empty (so the file no longer has to be created by hand beforehand).
     *
     * @param file the workbook file
     * @return the workbook to append the new sheet to
     * @throws IOException if the existing file cannot be read
     */
    private static XSSFWorkbook openWorkbook(File file) throws IOException {
        if (!file.exists() || file.length() == 0) {
            return new XSSFWorkbook();
        }
        // The workbook is read completely here, so the input stream can be closed immediately.
        try (FileInputStream in = new FileInputStream(file)) {
            return new XSSFWorkbook(in);
        }
    }
}
/**
* Copyright (C), 2000-2021, XXX有限公司
* FileName: BlogArticleStatistics
* Author: wangyetao
* Date: 21-12-27 05:20:10
* Description: 博主博客文章统计
*
* History:
* 作者姓名
* 修改时间
* 版本号
* 版本描述
*
* wangyetao
* 2021年 12月 27日 星期一 07:18:11 CST
* 版本号
* 最近一次修改
*/
package simple.call.blogstar;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Collects the article list of one CSDN blog and exports it to an Excel workbook.
 *
 * <p>{@code main} opens {@code url} in Chrome through Selenium, walks the paginated article list
 * by clicking the {@code js-page-next} element, reads every {@code article-item-box} element
 * into an {@link Article}, and writes the rows to {@code outPutPath + filename + suffix}.
 *
 * @author wangyetao
 */
public class BlogArticleStatistics {
    /** Directory the workbook is written to (includes the trailing slash). */
    private static String outPutPath = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Base name of the output file, without extension. */
    private static String filename = "u014132947";
    /** Name of the single sheet created in the workbook. */
    private static String sheetname = "article_" + filename;
    private static String suffix = ".xlsx";
    /** Articles collected during the current run. */
    private static ArrayList<Article> blogArticles;
    /** Address of the blog home page. */
    private static String url = "https://blog.csdn.net/u014132947";

    /**
     * Scrapes the blog's article list and writes it to the output workbook.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // The chromedriver binary location must be set before the driver is created.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get(url);
            // Text of a single article item as seen on the page (the HTML markup of the original
            // note was lost):
            //   原创 获取世界人口排名2021
            //   获取世界人口排名2021,Linux配置Selenium+Chrome+Java实现自动化测试
            //   2021-12-26 06:16:59
            //   105
            blogArticles = new ArrayList<Article>();
            // Give the page time to finish rendering before querying the DOM.
            Thread.sleep(2000);
            // Expected total, read from the data-num attribute of the list header.
            int dataNum = Integer.parseInt(driver.findElement(By.id("container-header-blog")).getAttribute("data-num"));
            while (blogArticles.size() < dataNum) {
                List<WebElement> items = driver.findElements(By.className("article-item-box"));
                if (items.isEmpty()) {
                    // A page without articles cannot bring the count closer to dataNum.
                    break;
                }
                for (WebElement item : items) {
                    Article article = new Article();
                    // Title: text of the first link in the item.
                    article.title = item.findElement(By.tagName("a")).getText();
                    // Short summary.
                    article.content = item.findElement(By.className("content")).getText();
                    // Publication time.
                    article.publishTime = item.findElement(By.className("date")).getText();
                    // Read count.
                    // NOTE(review): presumably StringUtil.getInts returns the integers found in
                    // the text, in order - confirm against StringUtil.
                    article.readNum = StringUtil.getInts(item.findElement(By.className("read-num")).getText())[0];
                    blogArticles.add(article);
                }
                if (blogArticles.size() >= dataNum) {
                    break;
                }
                // findElements returns an empty list when there is no next-page control, whereas
                // findElement would throw NoSuchElementException (it never returns null).
                List<WebElement> nextButtons = driver.findElements(By.className("js-page-next"));
                if (nextButtons.isEmpty()) {
                    break;
                }
                nextButtons.get(0).click();
                // Give the next page time to render.
                // NOTE(review): if the click does not change the page while the count is still
                // below dataNum, the same items are collected again - confirm the site's paging.
                Thread.sleep(3000);
            }
        } finally {
            // quit() ends the whole WebDriver session; doing it in finally also releases the
            // browser when scraping fails part-way.
            driver.quit();
        }

        ArrayList<String> heads = new ArrayList<String>();
        heads.add("文章标题");
        heads.add("简要内容");
        heads.add("发布时间");
        heads.add("访问数");

        System.out.println("Creating excel");
        // try-with-resources closes the stream and the workbook even when writing fails.
        try (XSSFWorkbook workbook = new XSSFWorkbook()) {
            XSSFSheet sheet = workbook.createSheet(sheetname);
            // Column widths (POI measures them in 1/256ths of a character width): the read-count
            // column is narrow, the rest use the default.
            for (int i = 0; i < heads.size(); i++) {
                int chars = (i == 3) ? 6 : 15;
                sheet.setColumnWidth(i, chars * 256);
            }
            // Header row.
            Row header = sheet.createRow(0);
            for (int i = 0; i < heads.size(); i++) {
                header.createCell(i).setCellValue(heads.get(i));
            }
            // One row per collected article, in the same column order as the header.
            int rowNum = 1;
            for (Article article : blogArticles) {
                Row row = sheet.createRow(rowNum++);
                int col = 0;
                row.createCell(col++).setCellValue(article.title);
                row.createCell(col++).setCellValue(article.content);
                row.createCell(col++).setCellValue(article.publishTime);
                row.createCell(col++).setCellValue(article.readNum);
            }
            try (FileOutputStream out = new FileOutputStream(outPutPath + filename + suffix)) {
                workbook.write(out);
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException when the output path cannot be opened.
            e.printStackTrace();
        }
        System.out.println("Done");
    }
}
采集样例
2021线上评分[入围名单TOP100]数据采集|样例程序
/**
* Copyright (C), 2000-2021, XXX有限公司
* FileName: Blogstar2021_02
* Author: wangyetao
* Date: 2022年 01月 08日 星期六 21:49:17 CST
* Description: 线上评分[入围名单TOP100]数据采集,输出blogstar2021.xlsx
*
* History:
* 作者姓名
* 修改时间
* 版本号
* 版本描述
*
* wangyetao
* 2022年 01月 08日 星期六 22:03:26 CST
* 版本号
* 最近一次修改
*/
package simple.call.blogstar;
import org.apache.poi.common.usermodel.HyperlinkType;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CreationHelper;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFHyperlink;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import simple.call.util.StringUtil;
import simple.call.util.TimeUtil;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
 * Collects the shortlisted (TOP100) scoring list from the CSDN "Blog Star 2021" page and appends
 * it as a new, timestamp-named sheet to {@code blogstar2021.xlsx}.
 *
 * <p>{@code main} clicks every tab under the {@code authorscoring-nav} element, reads each
 * {@code authorscoring-cont-box} element shown for that tab into a {@link BlogStar}, and then
 * adds one sheet with all collected rows to the workbook at
 * {@code outPutPath + filename + suffix}.
 *
 * @author wangyetao
 */
public class Blogstar2021_02 {
    /** Directory of the workbook (includes the trailing slash). */
    private static String outPutPath = "/home/wangyetao/IdeaProjects/_psimplemvn/src/main/java/simple/call/blogstar/";
    /** Base name of the workbook; an existing file is extended, a missing one is created. */
    private static String filename = "blogstar2021";
    /** Sheet name for this run: the start time, so each run adds a distinct sheet. */
    private static String sheetname = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyyMMddHHmmss");
    private static String suffix = ".xlsx";
    /** Rows collected from the page during the current run. */
    private static ArrayList<BlogStar> blogStars;
    /** Address of the blogstar2021 page. */
    private static String url = "https://www.csdn.net/blogstar2021";

    /**
     * Scrapes all field tabs and appends the result sheet to the workbook.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while waiting for the page to render
     */
    public static void main(String[] args) throws InterruptedException {
        // The chromedriver binary location must be set before the driver is created.
        System.setProperty("webdriver.chrome.driver", "/usr/bin/chromedriver");
        ChromeDriver driver = new ChromeDriver();
        try {
            driver.get(url);
            blogStars = new ArrayList<BlogStar>();
            // Give the page time to finish rendering before querying the DOM.
            Thread.sleep(3000);
            List<WebElement> lis = driver.findElement(By.className("authorscoring-nav")).findElements(By.tagName("li"));
            for (int i = 0; i < lis.size(); i++) {
                WebElement tab = lis.get(i);
                tab.click();
                // Wait for the tab's content to be shown.
                Thread.sleep(2000);
                // The tab caption is the field name shared by every entry of this tab.
                String field = tab.getText();
                List<WebElement> boxs = driver.findElements(By.className("authorscoring-cont-box"));
                for (WebElement box : boxs) {
                    BlogStar blogStar = new BlogStar();
                    blogStar.field = field;
                    // Blogger display name.
                    blogStar.name = box.findElement(By.className("name")).getText();
                    List<WebElement> dts = box.findElements(By.tagName("dt"));
                    // First dt: ranking within the field; second dt: score.
                    blogStar.ranking = dts.get(0).getText();
                    // NOTE(review): presumably StringUtil.getInts returns the integers found in
                    // the text, in order - confirm against StringUtil.
                    blogStar.score = StringUtil.getInts(dts.get(1).getText())[0];
                    // Third link of the entry: scoring page; first link: blog home page.
                    blogStar.scorePage = box.findElements(By.tagName("a")).get(2).getAttribute("href");
                    blogStar.blogUrl = box.findElement(By.tagName("a")).getAttribute("href");
                    // Time the record was captured.
                    blogStar.createTime = TimeUtil.getStampToString(System.currentTimeMillis(), "yyyy-MM-dd HH:mm:ss");
                    blogStars.add(blogStar);
                }
                // Look the tabs up again before the next iteration, as the original code does
                // after each click.
                lis = driver.findElement(By.className("authorscoring-nav")).findElements(By.tagName("li"));
            }
        } finally {
            // quit() ends the whole WebDriver session; doing it in finally also releases the
            // browser when scraping fails part-way.
            driver.quit();
        }

        ArrayList<String> heads = new ArrayList<String>();
        heads.add("领域");
        heads.add("博主简称");
        heads.add("领域排名");
        heads.add("总评分");
        heads.add("参赛互动页");
        heads.add("博主首页");
        heads.add("录入时间");

        System.out.println("Creating excel");
        File file = new File(outPutPath + filename + suffix);
        // try-with-resources closes the stream and the workbook even when writing fails.
        try (XSSFWorkbook workbook = openWorkbook(file)) {
            XSSFSheet sheet = workbook.createSheet(sheetname);
            // Column widths (POI measures them in 1/256ths of a character width).
            sheet.setColumnWidth(0, 16 * 256);
            sheet.setColumnWidth(1, 20 * 256);
            sheet.setColumnWidth(2, 10 * 256);
            sheet.setColumnWidth(3, 10 * 256);
            sheet.setColumnWidth(4, 20 * 256);
            sheet.setColumnWidth(5, 20 * 256);
            sheet.setColumnWidth(6, 25 * 256);
            // Header row.
            Row header = sheet.createRow(0);
            for (int i = 0; i < heads.size(); i++) {
                header.createCell(i).setCellValue(heads.get(i));
            }
            CreationHelper createHelper = workbook.getCreationHelper();
            // One row per collected record, in the same column order as the header.
            int rowNum = 1;
            for (BlogStar blogStar : blogStars) {
                Row row = sheet.createRow(rowNum++);
                int col = 0;
                row.createCell(col++).setCellValue(blogStar.field);
                row.createCell(col++).setCellValue(blogStar.name);
                row.createCell(col++).setCellValue(blogStar.ranking);
                row.createCell(col++).setCellValue(blogStar.score);
                // The two URL columns carry a hyperlink in addition to the URL text.
                Cell scoreCell = row.createCell(col++);
                XSSFHyperlink scoreLink = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
                scoreLink.setAddress(blogStar.scorePage);
                scoreCell.setHyperlink(scoreLink);
                scoreCell.setCellValue(blogStar.scorePage);
                Cell blogCell = row.createCell(col++);
                XSSFHyperlink blogLink = (XSSFHyperlink) createHelper.createHyperlink(HyperlinkType.URL);
                blogLink.setAddress(blogStar.blogUrl);
                blogCell.setHyperlink(blogLink);
                blogCell.setCellValue(blogStar.blogUrl);
                row.createCell(col++).setCellValue(blogStar.createTime);
            }
            // The workbook is fully in memory at this point, so the same file can be rewritten.
            try (FileOutputStream out = new FileOutputStream(file)) {
                workbook.write(out);
            }
        } catch (IOException e) {
            // Also covers FileNotFoundException when the file cannot be opened.
            e.printStackTrace();
        }
        System.out.println("Done");
    }

    /**
     * Loads the existing workbook so earlier sheets are kept, or starts an empty one when the
     * file is missing or empty (so the file no longer has to be created by hand beforehand).
     *
     * @param file the workbook file
     * @return the workbook to append the new sheet to
     * @throws IOException if the existing file cannot be read
     */
    private static XSSFWorkbook openWorkbook(File file) throws IOException {
        if (!file.exists() || file.length() == 0) {
            return new XSSFWorkbook();
        }
        // The workbook is read completely here, so the input stream can be closed immediately.
        try (FileInputStream in = new FileInputStream(file)) {
            return new XSSFWorkbook(in);
        }
    }
}
采集样例
作于2021年 12月 27日 星期一 04:02:17 CST,归档于2021年 12月 27日 星期一 20:48:42 CST。