webmagic爬虫自学(六)网络爬虫模拟登陆[策略二:通过Selenium模拟表单提交]

一、搭建webmagic项目环境部分代码,请参考

https://blog.csdn.net/qq_29914837/article/details/89309298

二、网络爬虫模拟登陆[策略二:通过Selenium模拟表单提交]

webmagic爬虫自学(六)网络爬虫模拟登陆[策略二:通过Selenium模拟表单提交]_第1张图片

三、搭建Selenium自动化环境

1、首先需要下载与自己的chrome浏览器版本相对应的chromedriver

我们在地址栏中输入chrome://version可以查看到更加详细的版本情况。

不仅可以显示浏览器版本,还显示了用户代理,Flash版本,路径和状态信息等

webmagic爬虫自学(六)网络爬虫模拟登陆[策略二:通过Selenium模拟表单提交]_第2张图片
2、根据自己的chrome下载对应的chromedriver版本(可以通过这里查找)
http://chromedriver.storage.googleapis.com/index.html

webmagic爬虫自学(六)网络爬虫模拟登陆[策略二:通过Selenium模拟表单提交]_第3张图片

3、下载完毕后,将chromedriver解压并存放到本地指定的一个文件夹中
webmagic爬虫自学(六)网络爬虫模拟登陆[策略二:通过Selenium模拟表单提交]_第4张图片
4、下载后将chromedriver路径添加到系统环境变量path中
webmagic爬虫自学(六)网络爬虫模拟登陆[策略二:通过Selenium模拟表单提交]_第5张图片

四、webmagic通过Selenium模拟表单提交

package demo.blog.csdn.net3.model;

import java.util.Date;
import java.util.List;

import us.codecraft.webmagic.model.annotation.ExtractByUrl;

/**
 * Plain data holder describing one CSDN blog article.
 *
 * <p>Each property is exposed through a conventional getter/setter pair; the
 * class carries no logic of its own. Only {@code url} has an extraction
 * annotation ({@code @ExtractByUrl}).
 */
public class CsdnBlog {

    /** Article title. */
    private String article = "";

    /** Publication date, held as text; no initial value is assigned. */
    private String time;

    /** Author nickname. */
    private String nick_name = "";

    /** Number of reads. */
    private int read_count;

    /**
     * Tags in list form.
     * NOTE(review): raw List - element type is not visible here (presumably
     * String); confirm before parameterizing.
     */
    private List labelList;

    /** Tags in text form. */
    private String label = "";

    /**
     * Categories in list form.
     * NOTE(review): raw List - element type is not visible here (presumably
     * String); confirm before parameterizing.
     */
    private List categoryList;

    /** Categories in text form. */
    private String category = "";

    /** Article body. */
    private String content = "";

    /** Link of the article page. */
    @ExtractByUrl
    private String url = "";

    /** Time at which the article was collected. */
    private Date collect_time;

    /** @return the article title */
    public String getArticle() {
        return this.article;
    }

    /** @param article the article title */
    public void setArticle(String article) {
        this.article = article;
    }

    /** @return the publication date text */
    public String getTime() {
        return this.time;
    }

    /** @param time the publication date text */
    public void setTime(String time) {
        this.time = time;
    }

    /** @return the author nickname */
    public String getNick_name() {
        return this.nick_name;
    }

    /** @param nick_name the author nickname */
    public void setNick_name(String nick_name) {
        this.nick_name = nick_name;
    }

    /** @return the number of reads */
    public int getRead_count() {
        return this.read_count;
    }

    /** @param read_count the number of reads */
    public void setRead_count(int read_count) {
        this.read_count = read_count;
    }

    /** @return the tag list (the stored reference, not a copy) */
    public List getLabelList() {
        return this.labelList;
    }

    /** @param labelList the tag list */
    public void setLabelList(List labelList) {
        this.labelList = labelList;
    }

    /** @return the tags as text */
    public String getLabel() {
        return this.label;
    }

    /** @param label the tags as text */
    public void setLabel(String label) {
        this.label = label;
    }

    /** @return the category list (the stored reference, not a copy) */
    public List getCategoryList() {
        return this.categoryList;
    }

    /** @param categoryList the category list */
    public void setCategoryList(List categoryList) {
        this.categoryList = categoryList;
    }

    /** @return the categories as text */
    public String getCategory() {
        return this.category;
    }

    /** @param category the categories as text */
    public void setCategory(String category) {
        this.category = category;
    }

    /** @return the article body */
    public String getContent() {
        return this.content;
    }

    /** @param content the article body */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the link of the article page */
    public String getUrl() {
        return this.url;
    }

    /** @param url the link of the article page */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the collection time (the stored reference, not a copy) */
    public Date getCollect_time() {
        return this.collect_time;
    }

    /** @param collect_time the collection time */
    public void setCollect_time(Date collect_time) {
        this.collect_time = collect_time;
    }

}

下面代码中的账号(username)和密码(password)填写本人的CSDN账号和密码即可。(在这里插入图片描述)
webdriver.chrome.driver 的值填写前面下载的chromedriver所在的路径即可

package demo.blog.csdn.net3;

import java.util.Set;

import org.apache.log4j.Logger;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * 爬取网址:https://mp.csdn.net/postlist
 * 网络爬虫模拟登陆[策略二:通过Selenium模拟表单提交]
 * @author yl
 */
/**
 * WebMagic page processor that logs in to CSDN through a Selenium-driven
 * Chrome session ("strategy two: simulate the form submission with Selenium").
 *
 * <p>Flow: {@link #Login()} opens the CSDN login page in Chrome, fills in the
 * account form, submits it and stores the browser's cookies. {@link #getSite()}
 * then copies those cookies onto the WebMagic {@link Site} so that the crawl
 * started in {@link #main(String[])} sends them with its requests.
 *
 * <p>Target page named by the original author: https://mp.csdn.net/postlist
 *
 * @author yl
 */
public class CsdnBlogCrawler implements PageProcessor {

    /** CSDN user whose blog home page is used as the start URL. */
    private static final String CSDN_NAME = "qq_29914837";

    /** Login account; replace the placeholder locally and never commit real credentials. */
    private static final String USERNAME = "XXXXXXXX";

    /** Login password; replace the placeholder locally and never commit real credentials. */
    private static final String PASSWORD = "XXXXXXXXX";

    /** System property from which the chromedriver executable path is read. */
    private static final String CHROME_DRIVER_PROPERTY = "webdriver.chrome.driver";

    /** Driver path used when the system property has not been set by the caller. */
    private static final String DEFAULT_CHROME_DRIVER_PATH = "D:\\driver\\chromedriver.exe";

    /** Fixed pause that gives the login page time to render / to process the submit. */
    private static final long PAGE_LOAD_PAUSE_MILLIS = 1000L;

    private static final Logger logger = Logger.getLogger(CsdnBlogCrawler.class);

    /** Cookies captured by {@link #Login()}; {@code null} until a login has captured them. */
    private Set<Cookie> cookies;

    private final Site site = Site.me().setRetryTimes(3).setSleepTime(0).setTimeOut(3000).setUserAgent(
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36");

    /**
     * Prints the text of the first link found inside the
     * {@code opt-box d-flex justify-content-end} div of the downloaded page.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        System.out.println(page.getHtml().xpath("//div[@class='opt-box d-flex justify-content-end']//a/text()").toString());
    }

    /**
     * Returns the crawl configuration, with every captured login cookie added.
     *
     * <p>When no cookies have been captured yet the site is returned as is
     * instead of failing with a {@link NullPointerException}.
     *
     * @return the site configuration used by the spider
     */
    @Override
    public Site getSite() {
        if (cookies == null) {
            logger.warn("No login cookies captured; crawling without a logged-in session");
            return site;
        }
        for (Cookie cookie : cookies) {
            site.addCookie(cookie.getName(), cookie.getValue());
        }
        return site;
    }

    /**
     * Logs in to CSDN in a real Chrome window and stores the resulting cookies.
     *
     * <p>The chromedriver path is taken from the {@code webdriver.chrome.driver}
     * system property; the built-in default is applied only when that property
     * is unset. The browser session is always shut down, even when an element
     * lookup fails part-way through.
     */
    public void Login() {
        if (System.getProperty(CHROME_DRIVER_PROPERTY) == null) {
            System.setProperty(CHROME_DRIVER_PROPERTY, DEFAULT_CHROME_DRIVER_PATH);
        }

        WebDriver driver = new ChromeDriver();
        try {
            driver.get("https://passport.csdn.net/account/login");
            // Switch to the account/password tab.
            // NOTE(review): an XPath starting with "//" is evaluated against the
            // whole document, not the "main-select" element - confirm this is intended.
            driver.findElement(By.className("main-select")).findElement(By.xpath("//ul//li[2]")).click();

            // Give the form time to render before typing into it.
            pause(PAGE_LOAD_PAUSE_MILLIS);
            driver.findElement(By.id("all")).sendKeys(USERNAME);
            driver.findElement(By.id("password-number")).sendKeys(PASSWORD);
            driver.findElement(By.xpath("//form//div//div[@class='form-group']//div//button")).click();

            // Give the site time to process the login before reading cookies.
            pause(PAGE_LOAD_PAUSE_MILLIS);
            cookies = driver.manage().getCookies();
        } finally {
            // quit() ends the whole session, including the chromedriver process.
            driver.quit();
        }
    }

    /**
     * Sleeps for the given time; on interruption restores the interrupt flag
     * and returns early so the caller can finish its cleanup.
     */
    private void pause(long millis) {
        try {
            Thread.sleep(millis);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.warn("Interrupted while waiting for the login page", e);
        }
    }

    /**
     * Logs in, then crawls the blog home page of {@code CSDN_NAME} on one thread.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        CsdnBlogCrawler csdnBlogCrawler = new CsdnBlogCrawler();
        csdnBlogCrawler.Login();
        // No pipeline is registered explicitly: a null pipeline would be
        // dereferenced when results are dispatched.
        Spider.create(csdnBlogCrawler)
                .addUrl("http://blog.csdn.net/" + CSDN_NAME)
                .thread(1)
                .run();
    }

}

运行程序后,程序会根据前面写好的登录地址,执行
driver.get("https://passport.csdn.net/account/login");
自动打开浏览器并进入登录界面,然后自动模拟用户操作,输入账号和密码进行登录。


如果你觉得本篇文章对你有所帮助的话,麻烦请点击头像右边的关注按钮,谢谢!

技术在交流中进步,知识在分享中传播

你可能感兴趣的:(webmagic爬虫)