webmagic爬取lg职位信息

本代码主要是自学了webmagic后,想实际找个项目练手,因此写了这点代码

package us.codecraft.webmagic.downloader;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.openqa.selenium.Cookie;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.JsonPathSelector;
import us.codecraft.webmagic.utils.HttpConstant;

import java.util.HashMap;
import java.util.List;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import static us.codecraft.webmagic.downloader.SeleniumTest.addRequestHeader;

/**
 * WebMagic {@link PageProcessor} for the Lagou position-list JSON endpoint.
 *
 * <p>For every downloaded page it stores one tab-separated line per position in the page's
 * result fields and, while the current page number is below {@code pageTotal}, queues a POST
 * request for the next page.
 *
 * @author Evan (2020/3/12)
 */
public class LaGouProcessor implements PageProcessor {

	/** URL of the paged position-list endpoint that every next-page request is sent to. */
	private static final String POSITION_AJAX_URL = "https://www.lagou.com/jobs/positionAjax.json?px=default&city=广州&needAddtionalResult=false";

	/** JSON keys copied from each position object, in output column order. */
	private static final String[] POSITION_FIELDS = {
			"positionName", "companyFullName", "companyShortName", "companySize",
			"industryField", "financeStage", "firstType", "secondType", "city",
			"district", "businessZones", "salary", "jobNature", "education",
			"positionAdvantage"
	};

	/** Highest page number that will be requested. */
	private final AtomicInteger pageTotal = new AtomicInteger(30);

	/** Current page number; incremented each time a next-page request is built. */
	private final AtomicInteger currentNo = new AtomicInteger(1);

	/** Running count of stored positions; used to build the result field keys. */
	private final AtomicInteger record = new AtomicInteger(0);

	/**
	 * {@code showId} taken from the latest response. It may change after every page request,
	 * so it is recorded here and sent as the {@code sid} form parameter of the next request.
	 */
	private volatile String showId = "";

	/**
	 * Cookies carried by the most recent response that had any. They are replayed on the
	 * next request as part of working around the site's anti-crawler check.
	 */
	private volatile List<org.apache.http.cookie.Cookie> xhttpCookies;

	Site site;

	/**
	 * @param site crawl settings returned unchanged by {@link #getSite()}
	 */
	public LaGouProcessor(Site site) {
		this.site = site;
	}

	@Override
	public void process(Page page) {
		handlePage(page);
	}

	@Override
	public Site getSite() {
		return site;
	}

	/**
	 * Extracts the positions of one response page and schedules the following page.
	 *
	 * @param page downloaded page whose raw text is the endpoint's JSON response
	 */
	private void handlePage(Page page) {
		String rawText = page.getRawText();
		System.out.println("抓取到的数据:"+rawText);

		List<String> positions = new JsonPathSelector("$.content.positionResult.result").selectList(rawText);
		// Parameter for the next paged query, e.g. sid:ef76aba27ec44eba80c54131f4b626af
		showId = new JsonPathSelector("$.content.showId").select(rawText);

		if (positions != null) {
			for (String positionJson : positions) {
				// Parse each element on its own instead of re-parsing List.toString() as JSON.
				JSONObject position = JSONObject.parseObject(positionJson);
				page.putField("No."+record.incrementAndGet(), formatPosition(position));
			}
		}

		if (currentNo.get() < pageTotal.get()) {
			page.addTargetRequest(buildNextPageRequest(page));
		}
	}

	/**
	 * Renders one position as a single line: the values of {@link #POSITION_FIELDS} separated
	 * by tabs and terminated by CRLF.
	 *
	 * @param position parsed position object
	 * @return the formatted line
	 */
	private static String formatPosition(JSONObject position) {
		StringBuilder line = new StringBuilder();
		for (int i = 0; i < POSITION_FIELDS.length; i++) {
			if (i > 0) {
				line.append('\t');
			}
			// A missing key yields null, which StringBuilder renders as the text "null".
			line.append(position.get(POSITION_FIELDS[i]));
		}
		return line.append("\r\n").toString();
	}

	/**
	 * Builds the POST request for the next page and advances {@code currentNo}.
	 *
	 * @param page the page just processed; its cookies are remembered for replay
	 * @return the request to queue
	 */
	private Request buildNextPageRequest(Page page) {
		Request request = new Request(POSITION_AJAX_URL);

		HashMap<String, Object> params = new HashMap<String, Object>();
		params.put("first","false");
		params.put("pn",currentNo.incrementAndGet());
		params.put("kd","java");
		params.put("sid",showId);
		request.setExtras(params);
		request.setMethod(HttpConstant.Method.POST);
		request.setRequestBody(HttpRequestBody.form(params, "utf-8"));

		// Cookies and headers are only attached when the browser session supplied cookies.
		Set<Cookie> browserCookies = SeleniumTest.cookieStore;
		if (browserCookies != null && !browserCookies.isEmpty()) {
			for (Cookie browserCookie : browserCookies) {
				request.addCookie(browserCookie.getName(), browserCookie.getValue());
			}
			addResponseCookies(page, request);
			addRequestHeader(request);
		}

		pauseBriefly();
		System.out.println("添加第"+currentNo.get()+"页数据请求任务"+params);
		return request;
	}

	/**
	 * Remembers the cookies of {@code page} when it has any, then copies the remembered
	 * cookies (names and values trimmed) onto {@code request}.
	 */
	private void addResponseCookies(Page page, Request request) {
		List<org.apache.http.cookie.Cookie> responseCookies = page.getCookies();
		if (responseCookies != null && !responseCookies.isEmpty()) {
			xhttpCookies = responseCookies;
		}

		// Read the volatile field once so the check and the loop see the same list.
		List<org.apache.http.cookie.Cookie> remembered = xhttpCookies;
		if (remembered == null || remembered.isEmpty()) {
			return;
		}
		for (org.apache.http.cookie.Cookie cookie : remembered) {
			System.out.println("======防止被识别添加cookie "+cookie.getName()+" "+cookie.getValue());
			request.addCookie(cookie.getName().trim(), cookie.getValue().trim());
		}
	}

	/** Sleeps 20 ms before the next request is queued, keeping the interrupt flag if interrupted. */
	private static void pauseBriefly() {
		try {
			Thread.sleep(20);
		} catch (InterruptedException e) {
			// Restore the flag so the code running this thread can still observe the interrupt.
			Thread.currentThread().interrupt();
		}
	}

}

package us.codecraft.webmagic.downloader;

import org.junit.Ignore;
import org.junit.Test;
import org.openqa.selenium.By;
import org.openqa.selenium.Cookie;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.remote.DesiredCapabilities;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.model.HttpRequestBody;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.utils.HttpConstant;

import java.io.File;
import java.util.*;


/**
 * Manual test that opens the Lagou search page in Chrome to obtain the initial cookies and
 * then runs a single-threaded WebMagic spider with {@link LaGouProcessor} against the
 * position-list endpoint.
 */
public class SeleniumTest {

	/** Cookies captured from the browser session; read by {@link LaGouProcessor} for later pages. */
	public static Set<Cookie> cookieStore;

	/**
	 * Captures the browser cookies, builds the first page request and runs the spider,
	 * writing results below the working directory.
	 */
	@Ignore("need chrome driver")
	@Test
	public void testSelenium() {
		Set<Cookie> cookies = fetchBrowserCookies();

		Site site = Site.me()
				.setRetryTimes(3)
				.setSleepTime(5000)
				.setCharset("UTF-8");

		Request request = buildFirstPageRequest();

		// Reuse the browser cookies when calling the position-list endpoint.
		if (cookies != null && !cookies.isEmpty()) {
			cookieStore = cookies;
			for (Cookie cookie : cookies) {
				String cookieName = cookie.getName();
				String cookieValue = cookie.getValue();
				System.out.println(String.format("cookieName:%s cookieValue:%s",cookieName,cookieValue));
				request.addCookie(cookieName,cookieValue);
			}

			addRequestHeader(request);
		}

		// Start the spider.
		System.out.println("启动爬虫....");
		File file = new File("");
		Spider.create(new LaGouProcessor(site))
				.addRequest(request)
				.addPipeline(new FilePipeline(file.getAbsolutePath()+File.separator+"拉勾网职位信息"))
				.thread(1) // a single thread, to avoid tripping the anti-crawler check
				.run();
	}

	/**
	 * Opens the search page in Chrome, reads the session cookies and clicks one page element.
	 * The browser is always shut down, even when locating or clicking the element fails.
	 *
	 * @return the cookies reported by the browser after the page has loaded
	 */
	private static Set<Cookie> fetchBrowserCookies() {
		System.getProperties().setProperty("webdriver.chrome.driver", "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chromedriver.exe");

		Map<String, Object> contentSettings = new HashMap<String, Object>();
		// NOTE(review): presumably 2 means "block images" — confirm against Chrome's preference docs.
		contentSettings.put("images", 2);

		Map<String, Object> preferences = new HashMap<String, Object>();
		preferences.put("profile.default_content_settings", contentSettings);

		DesiredCapabilities caps = DesiredCapabilities.chrome();
		caps.setCapability("chrome.prefs", preferences);
		caps.setCapability("chrome.switches", Arrays.asList("--user-data-dir=F:\\data\\webmagic\\chrome"));

		WebDriver webDriver = new ChromeDriver(caps);
		try {
			webDriver.get("https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=?labelWords=hot");

			Set<Cookie> cookies = webDriver.manage().getCookies();
			System.out.println("======cookies====="+cookies.toString());

			// NOTE(review): presumably this dismisses an overlay; the absolute XPath is tied to
			// the current page layout — confirm it still matches.
			WebElement webElement = webDriver.findElement(By.xpath("/html/body/div[9]/div/div[2]"));
			System.out.println(webElement.getAttribute("outerHTML"));
			webElement.click();
			return cookies;
		} finally {
			// quit() rather than close(): release the driver session on every exit path.
			webDriver.quit();
		}
	}

	/**
	 * Builds the POST request for page 1 of the position list (keyword {@code java}).
	 *
	 * @return the request, without cookies or headers
	 */
	private static Request buildFirstPageRequest() {
		Request request = new Request("https://www.lagou.com/jobs/positionAjax.json?px=default&city=广州&needAddtionalResult=false");
		Map<String, Object> params = new HashMap<String, Object>();
		params.put("first","true");
		params.put("pn",1);
		params.put("kd","java");
		request.setExtras(params);
		request.setMethod(HttpConstant.Method.POST);
		request.setRequestBody(HttpRequestBody.form(params, "utf-8"));
		return request;
	}

	/**
	 * Adds the fixed set of browser-like headers used for the position-list endpoint.
	 *
	 * @param request request to decorate
	 */
	public static void addRequestHeader(Request request) {
		request.addHeader("Accept","application/json, text/javascript, */*; q=0.01");
		request.addHeader("Accept-Language","zh-CN,zh;q=0.9");
		request.addHeader("Accept-Encoding","gzip, deflate, br");
		request.addHeader("Content-Type","application/x-www-form-urlencoded; charset=UTF-8");
		request.addHeader("Cache-Control","no-cache");
		request.addHeader("Host","www.lagou.com");
		request.addHeader("Origin","https://www.lagou.com");
		request.addHeader("Pragma","no-cache");
		request.addHeader("Referer","https://www.lagou.com/jobs/list_java?labelWords=&fromSearch=true&suginput=?labelWords=hot");
		request.addHeader("Sec-Fetch-Dest","empty");
		request.addHeader("Sec-Fetch-Mode","cors");
		request.addHeader("Sec-Fetch-Site","same-origin");
		request.addHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36");
		// NOTE(review): the "Anit" spelling is kept on purpose — presumably it matches the
		// header names the site's own front end sends; confirm before "correcting" it.
		request.addHeader("X-Anit-Forge-Code","0");
		request.addHeader("X-Anit-Forge-Token","None");
		request.addHeader("X-Requested-With","XMLHttpRequest");
	}
}

太懒了,不想写太多字,搞了两三天,累死了,随便记录一下
破解过程记录:

  • 用Selenium打开搜索页面,拿到初始cookie值
  • 每次分页查询的showId是下一次表单的sid
  • 连续请求接口后,响应里会带上新的cookie(如XHTTP-TOKEN),下次请求分页接口时需要把这些cookie一并放进请求的cookie中

你可能感兴趣的:(爬虫,webmagic,爬虫)