https://blog.csdn.net/qq_29914837/article/details/89309298
在使用爬虫的过程中,有的网站的信息必须登录后才能查看,比如CSDN网站中的
“管理博客”入口,必须登录后才会显示。
如果想要爬取这些信息,就必须先登录,因此需要获取登录后的cookie信息。
进入CSDN博客并登录后,按F12打开DevTools页面,即可查看到cookie信息。
最简单的做法是将这些cookie信息保存下来,在爬虫发起请求时一并带上。
二、获取cookie模拟登录
package demo.blog.csdn.net3.model;
import java.util.Date;
import java.util.List;
import us.codecraft.webmagic.model.annotation.ExtractByUrl;
/**
 * Mutable data holder describing one crawled CSDN blog article.
 *
 * <p>This is a plain bean: each field is exposed through a getter/setter pair
 * and no validation or copying is performed by any accessor.
 */
public class CsdnBlog {

    /** Article title; initialised to the empty string. */
    private String article = "";

    /** Publication date, kept as text; not initialised here. */
    private String time;

    /** Author nickname; initialised to the empty string. */
    private String nick_name = "";

    /** Number of reads. */
    private int read_count;

    /** Tags as a list (element type is not declared). */
    private List labelList;

    /** Tags as a single string; initialised to the empty string. */
    private String label = "";

    /** Categories as a list (element type is not declared). */
    private List categoryList;

    /** Categories as a single string; initialised to the empty string. */
    private String category = "";

    /** Article body; initialised to the empty string. */
    private String content = "";

    /**
     * Article link; initialised to the empty string.
     * NOTE(review): presumably filled in by WebMagic from the page URL via
     * the annotation below - confirm against the WebMagic documentation.
     */
    @ExtractByUrl
    private String url = "";

    /** Time at which the article was collected. */
    private Date collect_time;

    // ---- title ----

    /** @return the article title */
    public String getArticle() {
        return this.article;
    }

    /** @param article the article title to store */
    public void setArticle(String article) {
        this.article = article;
    }

    // ---- publication date ----

    /** @return the publication date text */
    public String getTime() {
        return this.time;
    }

    /** @param time the publication date text to store */
    public void setTime(String time) {
        this.time = time;
    }

    // ---- author ----

    /** @return the author nickname */
    public String getNick_name() {
        return this.nick_name;
    }

    /** @param nick_name the author nickname to store */
    public void setNick_name(String nick_name) {
        this.nick_name = nick_name;
    }

    // ---- read count ----

    /** @return the number of reads */
    public int getRead_count() {
        return this.read_count;
    }

    /** @param read_count the number of reads to store */
    public void setRead_count(int read_count) {
        this.read_count = read_count;
    }

    // ---- tags ----

    /** @return the tag list, exactly as stored (no copy is made) */
    public List getLabelList() {
        return this.labelList;
    }

    /** @param labelList the tag list to store (kept by reference) */
    public void setLabelList(List labelList) {
        this.labelList = labelList;
    }

    /** @return the tags as a single string */
    public String getLabel() {
        return this.label;
    }

    /** @param label the tags as a single string */
    public void setLabel(String label) {
        this.label = label;
    }

    // ---- categories ----

    /** @return the category list, exactly as stored (no copy is made) */
    public List getCategoryList() {
        return this.categoryList;
    }

    /** @param categoryList the category list to store (kept by reference) */
    public void setCategoryList(List categoryList) {
        this.categoryList = categoryList;
    }

    /** @return the categories as a single string */
    public String getCategory() {
        return this.category;
    }

    /** @param category the categories as a single string */
    public void setCategory(String category) {
        this.category = category;
    }

    // ---- body ----

    /** @return the article body */
    public String getContent() {
        return this.content;
    }

    /** @param content the article body to store */
    public void setContent(String content) {
        this.content = content;
    }

    // ---- link ----

    /** @return the article link */
    public String getUrl() {
        return this.url;
    }

    /** @param url the article link to store */
    public void setUrl(String url) {
        this.url = url;
    }

    // ---- collection time ----

    /** @return the time at which the article was collected */
    public Date getCollect_time() {
        return this.collect_time;
    }

    /** @param collect_time the collection time to store */
    public void setCollect_time(Date collect_time) {
        this.collect_time = collect_time;
    }
}
package demo.blog.csdn.net3;
import org.apache.log4j.Logger;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
/**
 * WebMagic page processor that simulates a logged-in CSDN session by replaying
 * cookies copied from a browser (login strategy one: reuse existing cookies).
 *
 * <p>{@link #main(String[])} seeds the spider with
 * {@code "http://blog.csdn.net/" + csdn_name} and runs it on a single thread.
 * For every downloaded page, {@link #process(Page)} prints the text of the
 * link inside the blog's option box; per the original author, seeing
 * "管理博客" printed means the simulated login worked.
 *
 * <p>NOTE(review): the original header named https://mp.csdn.net/postlist as
 * the crawl target, but the code seeds the blog home page instead.
 *
 * @author yl
 */
public class CsdnBlogCrawler2 implements PageProcessor {

    /** CSDN account name; appended to the blog base URL to form the seed URL. */
    public static final String csdn_name = "qq_29914837";

    /** Shared class logger; used to report a missing login marker. */
    private static final Logger LOGGER = Logger.getLogger(CsdnBlogCrawler2.class);

    // The addCookie lines below look like they were generated with this
    // spreadsheet formula (name in column A, value in column B):
    //   =".addCookie("""&A2&""","""&B2&""")"
    //
    // SECURITY NOTE(review): these values are session credentials copied from a
    // browser. They expire, and they should not live in source control;
    // consider loading them from external configuration instead.
    //
    // The original list registered "_ga" and "_gid" twice in a row. Only the
    // later value of each is kept here, matching the last-write-wins result of
    // registering the same cookie name twice.
    private final Site site = Site.me().setDomain("blog.csdn.net").setSleepTime(3000).setUserAgent(
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36")
            // IMPORTANT: the following entries simulate the login; all of them come from the browser.
            .addCookie("ADHOC_MEMBERSHIP_CLIENT_ID1.0","aa471269-79de-8522-560a-0a252c67c602")
            .addCookie("ARK_ID","JSe3a24bd0201fd5540926eb969f751da5e3a2")
            .addCookie("AU","522")
            .addCookie("BAIDUID","CB366337BFAF861341D00A8BD9C10D92:FG=1")
            .addCookie("BAIDU_SSP_lcr","https://graph.qq.com/oauth2.0/show?which=Login&display=pc&client_id=100270989&response_type=code&redirect_uri=https%3A%2F%2Fpassport.csdn.net%2Faccount%2Flogin%3FpcAuthType%3Dqq%26state%3Dtest")
            .addCookie("BDORZ","B490B5EBF6F3CD402E515D22BCDA1598")
            .addCookie("BIDUPSID","3A63C0C9EF7F0594394D19B30119BF80")
            .addCookie("BT","1554359603673")
            .addCookie("CloudGuest","gWZ1tVdz/I9ISdDDIcZAJ2ok+osCgdixwMtOINtrVDiBpmeXweFyO0LAfiT6xA+jmpnyqqyDqJyukq5YUFGSmOItZgEffYovFmxs6rN5Adh3GvTJJdxVr7rNA9KyR8QmYLwlMCwLtA5cHWrPbvKZj6HsYSSNwLzJmisTdq8a5dL9VwPrtTba4Nxoa0j/NdRc")
            .addCookie("HMACCOUNT","2F0945C300F7BCF8")
            .addCookie("HMVT","6bcd52f51e9b3dce32bec4a3997715ac|1554359627|")
            .addCookie("H_PS_PSSID","1450_211031_28768_28724_28558_28585_28603_28625_28605")
            .addCookie("Hm_ct_6bcd52f151e9b3dce32bec4a3997715ac","1788*1*PC_VC!5744*1*qq_29914837!6525*1*10_28867322920-1540868724025-839757")
            .addCookie("Hm_lpvt2_6bcd52f51e9b3dce32bec4a3997715ac","1554359624")
            .addCookie("Hm_lpvt_e193a8b00cf63f716d774540875007664","1554284443")
            .addCookie("Hm_lvt_20bba81dc5fa07f97ba1779a51ed918a","1535625612")
            .addCookie("Hm_lvt_6bcd452f51e9b3dce32bec4a3997715ac","1.55435758015543E+39")
            .addCookie("Hm_lvt_e159a8b00cf63f716d774540875007664","1.55420144215542E+29")
            .addCookie("PSTM","15541604765")
            .addCookie("SESSION","f4fa5caa-c193a-486e-94c9-c7c34d06d5f6")
            .addCookie("TINGYUN_DATA","%7B2%22id%22%3A%22-sf2Cni530g%23HL5wvli0FZI%22%2C%22n%22%3A%22WebAction%2FCI%2FarticleList%252Flist%22%2C%22tid%22%3A%226e6120ef5abb6c%22%2C%22q%22%3A0%2C%22a%22%3A53%7D")
            .addCookie("UM_distinctid","1661c314ab153d2-0e71d4a46e8aba-5b163f13-100200-166c314ab1626a")
            .addCookie("UN","qq_29914837")
            .addCookie("UserInfo","5c847850d7194e9e94aeba95ee66e2fc")
            .addCookie("UserName","qq_29914837")
            .addCookie("UserNick","%E7%BD%91%E7%95%8C%E5%85%AD%E5%85%AD%E5%B1%85%E5%A3%AB")
            .addCookie("UserToken","5c847850d7194e9e94aeba95ee66e2fc")
            .addCookie("__cfduid","db6ad5fcdbbbab25d6c4ecfc6e9e739a01536029112")
            .addCookie("__utma","17226283.271695300.1537435940.1537435940.1540455490.2")
            .addCookie("__yadk_uid","6eUT39xr0udoIWIg5eO5F68Va2RpVKJP")
            .addCookie("_ga","GA1.2.708462049.1544146269")
            .addCookie("_gid","GA1.2.622036880.1554201444")
            .addCookie("_javaeye3_session_","BAh7BjoPc2Vzc2lvbl9pZCIlMzFhNjk2OThkZjcwNjhkN2EzNGNiM2VhN2U2NWQ1MjA%3D--bf5b5979573e5d40d105a8c446f44dcd9f8be422")
            .addCookie("_javaeye_cookie_id_","1523948857395170")
            .addCookie("aliyun_UAToken","115#1EB8U51O1TN2wLPsTCZE1CsoE562CpA11g2mOCXw81OEDUlCKOm6ICtuKXRhyzFGvSfyQ+T8y5jLi/JJhUU4AkNca8pAurPQOSfyetT8ukZQgQkRhEPCOSgaCY9XuzFZASAyeKT8ukNQiFMJhUU4AWNcadyXyzFQOSRlTRDv53bK1CX5H/Gsz1wQSFAF5U6xPs/xw07PLxsuRkxndqe5m/o1cdCsQXS1Z1KagrHPhOFE9HQo2yTHCghsvbQadvn0u5OVLV5yX/66eKcuCwfctxVOk1AEqpKEX+BeHfBOQPb4J5D8FDtoUs9iu78xnJd8xtZvOg9spt62hfQOmfGRIn9kUZye8bGQmXaEXFWgHYx5Xc2czQToq+Mcjx2fLknAnWM41YENsnAdxlaZ2yrbB5wJnQ07tUG87eLcaIKARnolBg4sMBbegVt8lg2NmpoPbCBhEB/sRqmXz0oZpuRkKkS0ddZcBP7kVZBXu3Lmuxt7pIDqSH0c0pe7FCfBWwM5BtSf+uH8/eQK7WxxhRsR1inqB525JuFw+b8ig9kJR4T1HLlaIk+dWZWZa9O0s6PCKoeamo2Ogh8F2Kn8mpfsgiLNr3VVy+gkyBnPakA5UzNF/Ec6cFLj+RVxycWQ/schVL0dYPWwiOLfK7LxmkxO2f==")
            .addCookie("aliyun_webUmidToken","TB638344B44800204EB5241A1AD83D46B2CDC42BE26B2D28D4209FF2C8B")
            .addCookie("blogTipShow","TRUE")
            .addCookie("dc_session_id","10_1554111375664.355014")
            .addCookie("dc_tos","ppf2bk7")
            .addCookie("smidV2","201806212304243cff744f8b1cb4ab13015660c60ccf38e00d97e7e0f187fbb0")
            .addCookie("uuid_tt_dd","10_288673222920-1540868724025-839757")
            .addCookie("yidun_tocken","9ca17a2e2e6ffcda170e2e6eed2aa4ebcb7a2abd252a29a8fa7c15e928b9faff268f6908cb9e521ac9ae5b5ae2af0feaec3b92abb9abc87e45394efbca5f64a878f9fa7d15ba8ebb892d35998b59abaef5eaef0ee9e")
            ;

    /**
     * Prints the text of the link found inside the blog's option box.
     *
     * <p>If "管理博客" is printed, the simulated login succeeded. When the
     * XPath matches nothing the extracted text is {@code null}; the literal
     * "null" is still printed (as before) and a warning is logged as well, so
     * the failure is not mistaken for real page content.
     *
     * @param page the downloaded page handed over by the spider
     */
    @Override
    public void process(Page page) {
        String loginMarker = page.getHtml()
                .xpath("//div[@class='opt-box d-flex justify-content-end']//a/text()")
                .toString();
        if (loginMarker == null) {
            LOGGER.warn("Login marker not found on " + page.getUrl()
                    + "; the cookies may have expired or the page layout changed");
        }
        System.out.println(loginMarker);
    }

    /**
     * Returns the crawl configuration (domain, sleep time, user agent and
     * login cookies) built once for this processor.
     *
     * @return the site configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Starts a single-threaded spider on the blog home page of
     * {@link #csdn_name} and blocks until it finishes.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Spider.create(new CsdnBlogCrawler2())
                .addUrl("http://blog.csdn.net/" + csdn_name)
                .thread(1)
                .run();
    }
}
配置好cookie信息后运行程序,如果控制台输出【管理博客】四个字,代表模拟登录成功。
如果你觉得本篇文章对你有所帮助的话,麻烦请点击头像右边的关注按钮,谢谢!
技术在交流中进步,知识在分享中传播