这段代码是我自学 WebMagic 之后,想找个实际项目练手而写的。
package us.codecraft.webmagic.downloader;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.openqa.selenium.Cookie;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.webmagic.utils.HttpConstant;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import static us.codecraft.webmagic.downloader.SeleniumTest.addRequestHeader;
/**
 * WebMagic page processor for the Lagou position-search AJAX endpoint.
 *
 * <p>For every downloaded page it reads the JSON body, stores one result field per
 * position object found under {@code $.content.positionResult.result}, and then queues a
 * POST request for the next page until {@code pageTotal} pages have been requested.
 *
 * @author Evan
 * @since 2020/3/12 20:15
 */
public class LaGouProcessor implements PageProcessor {

    /** Endpoint every follow-up page request is posted to. */
    private static final String POSITION_AJAX_URL =
            "https://www.lagou.com/jobs/positionAjax.json?px=default&city=广州&needAddtionalResult=false";

    /** Separator written between the columns of one position record. */
    private static final String FIELD_SEPARATOR = " ";

    /** JSON keys copied, in this order, from each position object into its output line. */
    private static final String[] POSITION_FIELDS = {
        "positionName",
        "companyFullName",
        "companyShortName",
        "companySize",
        "industryField",
        "financeStage",
        "firstType",
        "secondType",
        "city",
        "district",
        "businessZones",
        "salary",
        "jobNature",
        "education",
        "positionAdvantage"
    };

    /** Total number of pages to request. */
    private final AtomicInteger pageTotal = new AtomicInteger(30);

    /** Number of the page requested most recently. */
    private final AtomicInteger currentNo = new AtomicInteger(1);

    /** Running counter used to number the stored position records. */
    private final AtomicInteger record = new AtomicInteger(0);

    /** The showId may change after every paged request, so the latest one is remembered. */
    private volatile String showId = "";

    /**
     * Cookies taken from the most recent page that carried any; they are replayed on later
     * requests (the original author's note ties this to the site's anti-crawling check).
     */
    private volatile List<org.apache.http.cookie.Cookie> xhttpCookies;

    Site site;

    /**
     * @param site crawl configuration returned unchanged by {@link #getSite()}
     */
    public LaGouProcessor(Site site) {
        this.site = site;
    }

    @Override
    public void process(Page page) {
        handlePage(page);
    }

    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts the positions from one response page and schedules the next page.
     *
     * @param page downloaded page whose raw text is the JSON response
     */
    private void handlePage(Page page) {
        String rawText = page.getRawText();
        System.out.println("抓取到的数据:" + rawText);

        List<String> list = new JsonPathSelector("$.content.positionResult.result").selectList(rawText);
        String totalCount = new JsonPathSelector("$.content.positionResult.totalCount").select(rawText);
        // Parameter for the next paged query, e.g. sid:ef76aba27ec44eba80c54131f4b626af
        String latestShowId = new JsonPathSelector("$.content.showId").select(rawText);

        if (totalCount == null) {
            // The parsed total was never used, but parsing a null value used to abort the page
            // with a NumberFormatException. Stop explicitly instead of queueing further pages.
            System.out.println("响应中缺少 totalCount,停止继续翻页");
            return;
        }
        if (latestShowId != null) {
            // Keep the previous showId when this response does not carry one.
            showId = latestShowId;
        }

        storePositions(page, list);

        if (currentNo.get() < pageTotal.get()) {
            scheduleNextPage(page);
        }
    }

    /** Stores one result field per position contained in {@code positions}. */
    private void storePositions(Page page, List<String> positions) {
        if (positions == null || positions.isEmpty()) {
            return;
        }
        // List#toString of the selected JSON snippets is parsed back as one JSON array.
        JSONArray array = JSONArray.parseArray(positions.toString());
        for (int i = 0; i < array.size(); i++) {
            JSONObject position = array.getJSONObject(i);
            page.putField("No." + record.incrementAndGet(), formatPosition(position));
        }
    }

    /** Renders the configured fields of one position as a single separated line. */
    private static String formatPosition(JSONObject position) {
        StringBuilder line = new StringBuilder();
        for (int i = 0; i < POSITION_FIELDS.length; i++) {
            if (i > 0) {
                line.append(FIELD_SEPARATOR);
            }
            line.append(position.get(POSITION_FIELDS[i]));
        }
        return line.append("\r\n").toString();
    }

    /** Builds the POST request for the next page and adds it to the page's target requests. */
    private void scheduleNextPage(Page page) {
        HashMap<String, Object> params = new HashMap<String, Object>();
        params.put("first", "false");
        params.put("pn", currentNo.incrementAndGet());
        params.put("kd", "java");
        params.put("sid", showId);

        Request request = new Request(POSITION_AJAX_URL);
        request.setExtras(params);
        request.setMethod(HttpConstant.Method.POST);
        request.setRequestBody(HttpRequestBody.form(params, "utf-8"));

        attachCookiesAndHeaders(page, request);
        pauseBriefly();

        System.out.println("添加第" + currentNo.get() + "页数据请求任务" + params);
        page.addTargetRequest(request);
    }

    /**
     * Copies the browser cookies and the remembered response cookies onto the request and adds
     * the common request headers. Nothing is added when no browser cookies were captured.
     */
    private void attachCookiesAndHeaders(Page page, Request request) {
        Set<Cookie> browserCookies = SeleniumTest.cookieStore;
        if (browserCookies == null || browserCookies.isEmpty()) {
            return;
        }
        for (Cookie cookie : browserCookies) {
            request.addCookie(cookie.getName(), cookie.getValue());
        }

        if (page.getCookies().size() > 0) {
            xhttpCookies = page.getCookies();
        }
        List<org.apache.http.cookie.Cookie> responseCookies = xhttpCookies;
        if (responseCookies != null && responseCookies.size() > 0) {
            for (org.apache.http.cookie.Cookie cookie : responseCookies) {
                System.out.println("======防止被识别添加cookie " + cookie.getName() + " " + cookie.getValue());
                request.addCookie(cookie.getName().trim(), cookie.getValue().trim());
            }
        }
        addRequestHeader(request);
    }

    /** Sleeps for 20 ms before the next request is queued, preserving the interrupt flag. */
    private static void pauseBriefly() {
        try {
            Thread.sleep(20);
        } catch (InterruptedException e) {
            // Restore the flag so the owning thread can still observe the interruption.
            Thread.currentThread().interrupt();
        }
    }
}
package us.codecraft.webmagic.downloader;
import org.junit.Ignore;
import org.junit.Test;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.remote.DesiredCapabilities;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.utils.HttpConstant;
import java.io.File;
import java.util.*;
/**
 * Manually run entry point: opens the Lagou listing page in Chrome to collect cookies, then
 * starts a WebMagic spider that posts to the position-search AJAX endpoint with those cookies.
 */
public class SeleniumTest {

    /** Default chromedriver location, used only when the system property is not already set. */
    private static final String DEFAULT_CHROME_DRIVER =
            "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe";

    /** Cookies captured from the browser session; read by LaGouProcessor for follow-up requests. */
    public static Set<Cookie> cookieStore;

    /**
     * Collects cookies through a real browser and runs the crawl with a single thread.
     * Ignored by default because it needs a local Chrome driver.
     */
    @Ignore("need chrome driver")
    @Test
    public void testSelenium() {
        Set<Cookie> cookies = captureBrowserCookies();

        Site site = Site.me()
                .setRetryTimes(3)
                .setSleepTime(5000)
                .setCharset("UTF-8");

        Request request = buildFirstPageRequest(cookies);

        // Start the crawler.
        System.out.println("启动爬虫....");
        File file = new File("");
        Spider.create(new LaGouProcessor(site))
                .addRequest(request)
                .addPipeline(new FilePipeline(file.getAbsolutePath() + File.separator + "拉勾网职位信息"))
                .thread(1) // a single thread, to avoid tripping the anti-crawling checks
                .run();
    }

    /**
     * Loads the listing page in Chrome, reads the session cookies and clicks one page element.
     * The browser is always shut down, even when the element lookup fails.
     *
     * @return the cookies read right after the page was loaded
     */
    private static Set<Cookie> captureBrowserCookies() {
        if (System.getProperty("webdriver.chrome.driver") == null) {
            // Respect a driver path supplied from outside; fall back to the original default.
            System.setProperty("webdriver.chrome.driver", DEFAULT_CHROME_DRIVER);
        }

        Map<String, Object> contentSettings = new HashMap<String, Object>();
        contentSettings.put("images", 2);
        Map<String, Object> preferences = new HashMap<String, Object>();
        preferences.put("profile.default_content_settings", contentSettings);

        DesiredCapabilities caps = DesiredCapabilities.chrome();
        caps.setCapability("chrome.prefs", preferences);
        caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=F:\\data\\webmagic\\chrome"));

        WebDriver webDriver = new ChromeDriver(caps);
        try {
            webDriver.get("https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=?labelWords=hot");
            Set<Cookie> cookies = webDriver.manage().getCookies();
            System.out.println("======cookies=====" + cookies.toString());

            // NOTE(review): absolute XPath into the page layout; the purpose of the click is
            // not documented in this file - confirm it is still needed.
            WebElement webElement = webDriver.findElement(By.xpath("/html/body/div[9]/div/div[2]"));
            System.out.println(webElement.getAttribute("outerHTML"));
            webElement.click();
            return cookies;
        } finally {
            // quit() ends the whole driver session, so a failed lookup no longer leaks the browser.
            webDriver.quit();
        }
    }

    /**
     * Builds the POST request for page 1 and, when browser cookies are available, publishes
     * them in {@link #cookieStore}, copies them onto the request and adds the common headers.
     */
    private static Request buildFirstPageRequest(Set<Cookie> cookies) {
        Request request = new Request("https://www.lagou.com/jobs/positionAjax.json?px=default&city=广州&needAddtionalResult=false");
        Map<String, Object> params = new HashMap<String, Object>();
        params.put("first", "true");
        params.put("pn", 1);
        params.put("kd", "java");
        request.setExtras(params);
        request.setMethod(HttpConstant.Method.POST);
        request.setRequestBody(HttpRequestBody.form(params, "utf-8"));

        // Replay the browser cookies on the position-list request.
        if (cookies != null && cookies.size() > 0) {
            cookieStore = cookies;
            for (Cookie cookie : cookies) {
                String cookieName = cookie.getName();
                String cookieValue = cookie.getValue();
                System.out.println(String.format("cookieName:%s cookieValue:%s", cookieName, cookieValue));
                request.addCookie(cookieName, cookieValue);
            }
            addRequestHeader(request);
        }
        return request;
    }

    /**
     * Adds the fixed set of browser-like headers sent with every position-search request.
     *
     * @param request request to decorate; modified in place
     */
    public static void addRequestHeader(Request request) {
        request.addHeader("Accept", "application/json, text/javascript, */*; q=0.01");
        request.addHeader("Accept-Language", "zh-CN,zh;q=0.9");
        // NOTE(review): advertises "br"; confirm the downloader can decode Brotli responses.
        request.addHeader("Accept-Encoding", "gzip, deflate, br");
        request.addHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
        request.addHeader("Cache-Control", "no-cache");
        request.addHeader("Host", "www.lagou.com");
        request.addHeader("Origin", "https://www.lagou.com");
        request.addHeader("Pragma", "no-cache");
        request.addHeader("Referer", "https://www.lagou.com/jobs/list_java?labelWords=&fromSearch=true&suginput=?labelWords=hot");
        request.addHeader("Sec-Fetch-Dest", "empty");
        request.addHeader("Sec-Fetch-Mode", "cors");
        request.addHeader("Sec-Fetch-Site", "same-origin");
        request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36");
        // NOTE(review): "Anit" looks like a typo but is left untouched; presumably it mirrors
        // the header name the site itself expects - confirm before changing.
        request.addHeader("X-Anit-Forge-Code", "0");
        request.addHeader("X-Anit-Forge-Token", "None");
        request.addHeader("X-Requested-With", "XMLHttpRequest");
    }
}
太懒了,不想写太多字;前后搞了两三天,累死了,这里随便记录一下。
破解过程记录: