我使用Maven来管理jar包,依赖如下:
<!-- Spring Boot web starter (no version given here; presumably managed by the parent POM - confirm) -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<!-- Lombok, declared optional -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<!-- HTTP client used to download static pages -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.5</version>
</dependency>
<!-- jsoup: HTML parsing for the crawler -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- Selenium: browser-driven crawling -->
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>2.44.0</version>
</dependency>
<!-- Apache POI: reading and writing the Excel (.xlsx) workbook -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>4.0.1</version>
</dependency>
Selenium是一个用于Web应用程序测试的工具。Selenium测试直接运行在浏览器中,所以我们要下载浏览器对应的工具。
下载地址
http://npm.taobao.org/mirrors/chromedriver/
下载的版本一定要与自己的chrome浏览器对应
查看:点击浏览器右上角的菜单,帮助—>关于Google Chrome
定位元素
Selenium提供了八种定位方式来帮助我们获取数据
By.id() //id查找
By.name() //name属性查找
By.tagName() //标签名查找
By.className() //class查找
By.xpath() //路径查找
By.cssSelector() //css样式查找
By.linkText() //超文本链接上的文字信息来定位元素
By.partialLinkText() //超文本链接上的文字部分匹配
详细请参考
https://www.cnblogs.com/hustar0102/p/5965095.html
模拟点击
先查找到指定元素,driver.findElement(By.你的方法).click()
以爬取古诗词网的诗经为例,获取其中311篇诗文的内容、作者、朝代、译文、注释、赏析等信息
https://so.gushiwen.org/gushi/shijing.aspx
操作步骤
先获取诗经各篇章的链接地址,这一步使用httpClient和jsoup(获取静态资源速度快;selenium需要操作浏览器,加载时间较长)
1、通过httpClient获取html的字符串
/**
 * Downloads the page at {@code url} with an HTTP GET and returns its body as text.
 *
 * @param url the address to request; may be {@code null}
 * @return the response body decoded as UTF-8, or {@code null} when {@code url} is
 *         {@code null}, the status code is not 200, or the response carries no entity
 * @throws IOException if executing the request or reading the body fails
 */
public String parse(String url) throws IOException {
    if (url == null) {
        System.err.println("地址为空!解析失败");
        return null;
    }
    // try-with-resources closes the response and then the client, also when
    // reading the body throws, so no connection is left open.
    try (CloseableHttpClient httpClient = HttpClients.createDefault();
         CloseableHttpResponse response = httpClient.execute(new HttpGet(url))) {
        if (response.getStatusLine().getStatusCode() != 200) {
            return null;
        }
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            // A 200 response without a body: nothing to convert.
            return null;
        }
        // EntityUtils reads the entity content and converts it to a string.
        return EntityUtils.toString(entity, "utf-8");
    }
}
获取诗经诗文的url,一个篇章封装成一个List
/**
 * Downloads the Shijing index page and collects the poem links of every section.
 *
 * <p>Each element with class {@code typecont} is treated as one section. The map key is
 * the first space-separated token of the section's text; the value holds one
 * {@code Chapter} per {@code <a>} inside that section, carrying the link text as name
 * and the link's {@code href} appended to the site root as URL.
 *
 * @return sections in page order, each mapped to its chapters
 * @throws IOException if the index page cannot be downloaded
 */
public Map<String,List<Chapter>> getData()throws IOException{
    String content = new GetUrlContent().parse("https://so.gushiwen.org/gushi/shijing.aspx");
    if (content == null) {
        // parse() yields null for a non-200 answer; fail with a clear message
        // instead of handing null to the HTML parser.
        throw new IOException("No content received from https://so.gushiwen.org/gushi/shijing.aspx");
    }
    // Insertion-ordered so callers see the sections in the order they appear on the page.
    Map<String, List<Chapter>> map = new java.util.LinkedHashMap<String, List<Chapter>>();
    Document doc = Jsoup.parse(content);
    // All elements whose class is "typecont".
    Elements sections = doc.getElementsByClass("typecont");
    for (Element section : sections) {
        // First token of the section text is used as the section title.
        String title = section.text().split(" ")[0];
        List<Chapter> chapterList = new ArrayList<Chapter>();
        // Name and URL are both read from the same <a>, so they cannot get out of
        // step and no index arithmetic between text tokens and links is needed.
        for (Element link : section.select("a")) {
            Chapter chapter = new Chapter();
            chapter.setName(link.text());
            chapter.setUrl("https://so.gushiwen.org" + link.attr("href"));
            chapterList.add(chapter);
            System.out.println(chapter.toString());
        }
        map.put(title, chapterList);
    }
    return map;
}
2、获取每一篇诗文的详细数据
为提高运行速度可禁止加载图片、css等
/**
 * Opens every chapter page in Chrome and extracts the poem text together with its
 * translation, annotation, analysis, background story and appreciation.
 *
 * <p>Image loading and notification prompts are disabled through Chrome preferences.
 * Chapters whose URL is 30 characters or shorter are not visited (presumably entries
 * without a usable href - confirm) and are returned with only chapter and name set.
 * NOTE(review): dynasty and author are not extracted.
 *
 * @return one {@code Poem} per chapter returned by {@code FetchChapter#getData()}
 * @throws IOException if the chapter index cannot be fetched
 */
public List<Poem> getData() throws IOException {
    // Respect a driver location supplied from outside; otherwise use the default path.
    if (System.getProperty("webdriver.chrome.driver") == null) {
        System.setProperty("webdriver.chrome.driver","C:\\Users\\admin\\AppData\\Local\\Google\\Chrome\\Application\\chromedriver.exe");
    }
    ChromeOptions options = new ChromeOptions();
    Map<String, Object> prefs = new HashMap<String, Object>();
    // 2 means "block"
    prefs.put("profile.default_content_setting_values.notifications", 2);
    // do not load images
    prefs.put("profile.managed_default_content_settings.images", 2);
    options.setExperimentalOption("prefs", prefs);

    // Fetch the index first so a download failure never leaves a browser running.
    Map<String, List<Chapter>> map = new FetchChapter().getData();
    List<Poem> poems = new ArrayList<Poem>();
    int add = 0;

    WebDriver driver = new ChromeDriver(options);
    try {
        for (Map.Entry<String, List<Chapter>> entry : map.entrySet()) {
            for (Chapter chapter : entry.getValue()) {
                Poem poem = new Poem();
                poem.setChapter(entry.getKey());
                poem.setName(chapter.getName());
                if (chapter.getUrl().length() > 30) {
                    driver.get(chapter.getUrl());
                    System.err.println("url" + chapter.getUrl());
                    System.out.println("跑了" + (++add) + "次");
                    fillPoemDetails(driver, poem);
                }
                poems.add(poem);
            }
        }
    } finally {
        // Always shut the browser down, also when a page fails to parse.
        driver.quit();
    }
    return poems;
}

// Expands the collapsed sections of the page currently loaded in driver and copies its texts into poem.
private void fillPoemDetails(WebDriver driver, Poem poem) {
    // Click the second link of every "contyishang" section that has one,
    // which reveals the hidden part of that section.
    for (WebElement section : driver.findElements(By.className("contyishang"))) {
        List<WebElement> links = section.findElements(By.tagName("a"));
        if (links.size() > 1) {
            links.get(1).click();
        }
    }

    WebElement head = driver.findElement(By.className("sons"));
    poem.setContent(head.findElement(By.className("contson")).getText());

    // Look the sections up again after the clicks above.
    for (WebElement section : driver.findElements(By.className("contyishang"))) {
        List<WebElement> headings = section.findElements(By.tagName("h2"));
        if (headings.isEmpty()) {
            // A section without a heading cannot be classified; skip it instead of aborting the crawl.
            continue;
        }
        String poemInfo = headings.get(0).getText();
        if ("译文及注释".equals(poemInfo)) {
            // First paragraph is the translation, second (when present) the annotation.
            List<WebElement> paragraphs = section.findElements(By.tagName("p"));
            if (!paragraphs.isEmpty()) {
                poem.setTranslation(paragraphs.get(0).getText());
            }
            if (paragraphs.size() > 1) {
                poem.setAnnotation(paragraphs.get(1).getText());
            }
        } else if ("赏析".equals(poemInfo)) {
            poem.setAnalyse(joinParagraphs(section));
        } else if ("创作背景".equals(poemInfo)) {
            poem.setStory(joinParagraphs(section));
        } else if ("鉴赏".equals(poemInfo)) {
            poem.setAppreciation(joinParagraphs(section));
        }
    }
}

// Concatenates the text of every <p> inside section, each followed by a newline.
private String joinParagraphs(WebElement section) {
    StringBuilder text = new StringBuilder();
    for (WebElement paragraph : section.findElements(By.tagName("p"))) {
        text.append(paragraph.getText()).append("\n");
    }
    return text.toString();
}
/**
 * Crawls all poems and writes them into a copy of the Excel template.
 *
 * <p>The first sheet of the template is used. Cell styles are taken from the row at
 * index 2, data rows are created from row index 4 onward, and the result is saved as
 * {@code test.xlsx} on the desktop.
 *
 * @param args unused
 * @throws IOException if crawling, reading the template or writing the result fails
 */
public static void main(String[] args) throws IOException {
    List<Poem> poems = new seleniumTest().getData();
    System.out.println("我已经读取好啦");

    // The template is loaded through a stream so the workbook is held in memory only;
    // closing it afterwards then cannot write anything back to the template file.
    try (java.io.InputStream template = new java.io.FileInputStream("C:\\Users\\admin\\Desktop\\诗词内容表.xlsx");
         Workbook wb = new XSSFWorkbook(template)) {
        // First sheet of the workbook.
        Sheet sheet = wb.getSheetAt(0);

        // Collect the per-column styles from the row at index 2. A missing row or
        // missing cells simply leave the corresponding style unset.
        Row styleRow = sheet.getRow(2);
        int styleCount = styleRow == null ? 0 : Math.max(0, styleRow.getLastCellNum());
        CellStyle[] styles = new CellStyle[styleCount];
        for (int i = 0; i < styleCount; i++) {
            Cell styleCell = styleRow.getCell(i);
            if (styleCell != null) {
                styles[i] = styleCell.getCellStyle();
            }
        }

        // One row per poem, starting at row index 4.
        int rowIndex = 4;
        for (Poem poem : poems) {
            Row row = sheet.createRow(rowIndex++);
            writeCell(row, 3, styles, poem.getName());
            writeCell(row, 6, styles, poem.getChapter());
            writeCell(row, 8, styles, poem.getContent());
            writeCell(row, 9, styles, poem.getAnnotation());
            writeCell(row, 10, styles, poem.getTranslation());
            writeCell(row, 19, styles, poem.getAppreciation());
            writeCell(row, 21, styles, poem.getAnalyse());
            writeCell(row, 22, styles, poem.getStory());
        }

        // The output stream is closed even when writing fails.
        try (FileOutputStream file = new FileOutputStream("C:\\Users\\admin\\Desktop\\test.xlsx")) {
            wb.write(file);
        }
    }
}

// Creates the cell at the given column, applies that column's template style when one exists, and stores value.
private static void writeCell(Row row, int column, CellStyle[] styles, String value) {
    Cell cell = row.createCell(column);
    if (column < styles.length && styles[column] != null) {
        cell.setCellStyle(styles[column]);
    }
    cell.setCellValue(value);
}
Gitee下载
https://gitee.com/g_x_liu/javapachongselenium