java 利用httpclient绕过百度验证登录爬取百度指数

公司要求爬取百度指数的相关数据,但百度指数的接口需要登录后才能访问。网上的相关文章大多是模拟整个登录流程,我尝试之后发现过于复杂,最终没有成功。于是换了一种思路:不模拟登录,而是直接复用浏览器中已登录会话的请求头(主要是 Cookie)来调用接口。具体做法如下:

分析请求内容

我用的是 Google Chrome 浏览器的开发者工具:按 F12 打开,切换到 Network 面板并选中 XHR 过滤器,即可查看页面发出的接口请求。
java 利用httpclient绕过百度验证登录爬取百度指数_第1张图片

java 利用httpclient绕过百度验证登录爬取百度指数_第2张图片
将上图中的请求头信息全部设置到下面代码的 setHeaders() 方法中,即可绕过登录;其中最关键的是 Cookie(需要先在浏览器中手动登录百度,再把该 Cookie 复制到配置文件中,Cookie 过期后需重新获取)。
目录结构:
java 利用httpclient绕过百度验证登录爬取百度指数_第3张图片

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;

import com.alibaba.fastjson.JSON;

public class Baiduzhishu {

	// Crawler configuration; the code in this class reads the keys
	// "filepath", "interval", "driverpath" and "cookie" from it.
	private static Properties pro = null;
	// Single HTTP client shared by every request issued from this class.
	static CloseableHttpClient httpClient = HttpClients.createDefault();
	// Loads the configuration once, when the class is initialized.
	static {
		if (pro == null) {
			pro = PropertiesUtil.getConfig();
		}
	}

	/**
	 * Program entry point: runs one crawl pass.
	 *
	 * @param args command-line arguments (not used)
	 */
	public static void main(String[] args) {
		try {
			testAuto();
		} catch (Exception failure) {
			// Last-resort handler: report the failure instead of letting it escape main.
			failure.printStackTrace();
		}
	}

	/**
	 * Crawls the index values for every pending keyword and appends the results to the
	 * output file named by the "filepath" property.
	 *
	 * <p>Pending records come from {@link #readFileAsList()}. They are processed in pages of
	 * 30; after each page the collected lines are appended to the output file, so an
	 * interrupted run loses at most one page of work. Each output line has the form
	 * {@code keyword|secondColumn|sousuo|zixun|meiti|timestamp}.
	 *
	 * <p>After the three requests for a keyword, the thread sleeps for "interval"
	 * milliseconds. Any failure is printed and ends the run; nothing is rethrown.
	 */
	public static void testAuto() {
		try {
			String filename = pro.getProperty("filepath");
			int interval = Integer.parseInt(pro.getProperty("interval"));
			File file = new File(filename);

			List<String[]> list = readFileAsList();
			System.out.println("总共有:" + list.size() + "条记录。");
			if (list.isEmpty()) {
				System.out.println("全部爬取完成。。。");
				return;
			}
			int pageSize = 30;
			// Step by page start index. The previous page-count computation used integer
			// division, which silently dropped the final partial page (and did nothing at
			// all when fewer than pageSize records were pending).
			for (int start = 0; start < list.size(); start += pageSize) {
				int end = Math.min(list.size(), start + pageSize);
				List<String[]> page = list.subList(start, end);
				List<String> bufferList = new ArrayList<String>(page.size());
				// Iterate over the page itself so a short last page cannot be over-indexed.
				for (String[] record : page) {
					String text = record[0];
					String sousuo = getSousuo(text);
					String zixun = getZixun(text);
					String meiti = getMeiti(text);

					Thread.sleep(interval);
					StringBuilder line = new StringBuilder();
					line.append(record[0]).append("|").append(record[1]).append("|").append(sousuo)
							.append("|").append(zixun).append("|").append(meiti).append("|")
							.append(DateUtil.getNowTime());
					bufferList.add(line.toString());
				}
				// A page is never empty here, so there is always something to flush.
				writeFile(bufferList, file);
				// Report the real cumulative count rather than a multiple of pageSize.
				System.out.println("写入了" + end + "条数据");
			}
		} catch (InterruptedException e) {
			// Preserve the interrupt status for whoever owns this thread.
			Thread.currentThread().interrupt();
			e.printStackTrace();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/**
	 * Performs a GET request with the configured headers and returns the response body.
	 *
	 * <p>The body is also echoed to standard output.
	 *
	 * @param url absolute URL to fetch
	 * @return the response body as text, or {@code null} if the request failed
	 */
	public static String getHtml(String url) {
		String result = null;
		try {
			HttpGet get = new HttpGet(url);
			setHeaders(get);
			// Closing the response releases the underlying connection; without this the
			// shared client leaks one connection per call.
			try (CloseableHttpResponse res = httpClient.execute(get)) {
				// "UTF-8" is only the fallback used when the response declares no charset.
				result = EntityUtils.toString(res.getEntity(), "UTF-8");
				System.out.println(result);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return result;
	}

	/**
	 * Applies the fixed set of browser-like request headers, plus the login cookie taken
	 * from the "cookie" configuration property, to the given request.
	 *
	 * @param get request to decorate; modified in place
	 */
	public static void setHeaders(HttpGet get) {
		final String acceptValue =
				"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3";
		final String userAgentValue =
				"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36";

		get.setHeader("Accept", acceptValue);
		get.setHeader("Accept-Encoding", "gzip, deflate");
		get.setHeader("Cache-Control", "max-age=0");
		get.setHeader("Connection", "keep-alive");

		// Cookie of the user's own logged-in session, prefixed with a PSTM entry that
		// carries the current timestamp.
		final String cookieValue = "PSTM=" + System.currentTimeMillis() + ";" + pro.getProperty("cookie");
		get.setHeader("Cookie", cookieValue);

		get.setHeader("Host", "index.baidu.com");
		get.setHeader("Upgrade-Insecure-Requests", "1");
		get.setHeader("User-Agent", userAgentValue);
	}

	/**
	 * Fetches the 30-day average search index for a keyword.
	 *
	 * <p>Reads {@code data.generalRatio[0].all.avg} from the SearchApi response.
	 *
	 * @param keyword keyword to look up
	 * @return the "avg" value as text, or an empty string if the request failed or the
	 *         response carried no "data" object
	 */
	public static String getSousuo(String keyword) {
		String sousuo = "";
		try {
			// Encode the keyword so spaces, '&', '#', '%' and similar characters cannot
			// break or alter the query string.
			String encodedWord = URLEncoder.encode(keyword, "UTF-8");
			String result = getHtml(
					"http://index.baidu.com/api/SearchApi/index?word=" + encodedWord + "&area=0&days=30");
			if (result == null) {
				// The request itself failed (already reported by getHtml); nothing to parse.
				return sousuo;
			}
			if (JSON.parseObject(result).getJSONObject("data") != null) {
				sousuo = JSON.parseObject(result).getJSONObject("data").getJSONArray("generalRatio").getJSONObject(0)
						.getJSONObject("all").getString("avg");
			}
		} catch (Exception e) {
			e.printStackTrace();
		}

		return sousuo;
	}

	/**
	 * Fetches the 30-day average feed ("zixun") index for a keyword.
	 *
	 * <p>Reads {@code data.index[0].generalRatio.avg} from the FeedSearchApi response.
	 *
	 * @param keyword keyword to look up
	 * @return the "avg" value as text, or an empty string if the request failed or the
	 *         response carried no "data" object
	 */
	public static String getZixun(String keyword) {
		String zixun = "";
		try {
			// Encode the keyword so spaces, '&', '#', '%' and similar characters cannot
			// break or alter the query string.
			String encodedWord = URLEncoder.encode(keyword, "UTF-8");
			String result = getHtml(
					"http://index.baidu.com/api/FeedSearchApi/getFeedIndex?word=" + encodedWord + "&area=0&days=30");
			if (result == null) {
				// The request itself failed (already reported by getHtml); nothing to parse.
				return zixun;
			}
			if (JSON.parseObject(result).getJSONObject("data") != null) {
				zixun = JSON.parseObject(result).getJSONObject("data").getJSONArray("index").getJSONObject(0)
						.getJSONObject("generalRatio").getString("avg");
			}
		} catch (Exception e) {
			e.printStackTrace();
		}

		return zixun;
	}

	/**
	 * Fetches the 30-day average value reported as the media ("meiti") index for a keyword.
	 *
	 * <p>Reads {@code data.index[0].generalRatio.avg} from the FeedSearchApi response.
	 *
	 * <p>NOTE(review): this issues exactly the same request and reads exactly the same JSON
	 * path as {@code getZixun}, so both columns will always hold the same value. This looks
	 * like a copy-paste leftover; presumably a separate media/news endpoint was intended.
	 * Confirm the correct endpoint and response shape before changing it.
	 *
	 * @param keyword keyword to look up
	 * @return the "avg" value as text, or an empty string if the request failed or the
	 *         response carried no "data" object
	 */
	public static String getMeiti(String keyword) {
		String meiti = "";
		try {
			// Encode the keyword so spaces, '&', '#', '%' and similar characters cannot
			// break or alter the query string.
			String encodedWord = URLEncoder.encode(keyword, "UTF-8");
			String result = getHtml(
					"http://index.baidu.com/api/FeedSearchApi/getFeedIndex?word=" + encodedWord + "&area=0&days=30");
			if (result == null) {
				// The request itself failed (already reported by getHtml); nothing to parse.
				return meiti;
			}
			if (JSON.parseObject(result).getJSONObject("data") != null) {
				meiti = JSON.parseObject(result).getJSONObject("data").getJSONArray("index").getJSONObject(0)
						.getJSONObject("generalRatio").getString("avg");
			}
		} catch (Exception e) {
			e.printStackTrace();
		}

		return meiti;
	}

	/**
	 * Reads the keyword file and returns the records that still need to be crawled.
	 *
	 * <p>The file is {@code keywords.csv} inside the directory named by the "driverpath"
	 * property and is read as UTF-8. A line is kept only if it contains no double quote,
	 * splits into exactly two comma-separated fields, and is not already present in the
	 * finished list returned by {@link #readAsFinishedList()}.
	 *
	 * @return the pending records, each as a two-element array of the CSV fields
	 * @throws IOException if the keyword file (or the finished file) cannot be read
	 */
	public static List<String[]> readFileAsList() throws IOException {
		List<String> finishedList = readAsFinishedList();
		System.out.println("已经完成的条数:" + finishedList.size());
		// Hash lookup instead of scanning the finished list once per input line.
		Set<String> finished = new HashSet<String>(finishedList);
		List<String[]> list = new ArrayList<String[]>();
		File file = new File(pro.getProperty("driverpath") + "/keywords.csv");
		// Decode explicitly as UTF-8. Reading with the platform charset and then
		// re-decoding the bytes as UTF-8 corrupts text whenever the platform charset is
		// not UTF-8.
		try (BufferedReader br = new BufferedReader(
				new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
			String line;
			// readLine() returning null is the reliable end-of-file signal; ready() only
			// says whether a read would block.
			while ((line = br.readLine()) != null) {
				String[] ts = line.split(",");
				if (!line.contains("\"") && ts.length == 2 && !finished.contains(line)) {
					list.add(ts);
				}
			}
		}
		return list;
	}

	/**
	 * Reads the progress file and returns the records that have already been crawled.
	 *
	 * <p>The file is {@code myfile.txt} inside the directory named by the "driverpath"
	 * property and is read as UTF-8. From each {@code |}-separated line the first two
	 * fields are re-joined with a comma, giving an entry in the same
	 * {@code field1,field2} form as a line of the keyword CSV.
	 *
	 * @return one {@code field1,field2} entry per usable line; empty if the progress file
	 *         does not exist yet
	 * @throws IOException if the progress file exists but cannot be read
	 */
	public static List<String> readAsFinishedList() throws IOException {
		List<String> list = new ArrayList<String>();
		File file = new File(pro.getProperty("driverpath") + "/myfile.txt");
		if (!file.isFile()) {
			// First run: nothing has been written yet, so nothing is finished.
			return list;
		}
		// Decode explicitly as UTF-8 rather than round-tripping through the platform charset.
		try (BufferedReader br = new BufferedReader(
				new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
			String line;
			while ((line = br.readLine()) != null) {
				String[] fields = line.split("\\|");
				if (fields.length < 2) {
					// Blank or truncated line: skip it rather than abort the whole run.
					continue;
				}
				list.add(fields[0] + "," + fields[1]);
			}
		}
		return list;
	}

	/**
	 * Appends the given lines to a file as UTF-8 text, one entry per line.
	 *
	 * <p>Existing content is kept; the file is created if it does not exist. An I/O failure
	 * is printed and otherwise ignored, so the caller is not interrupted.
	 *
	 * @param list lines to append, in order
	 * @param file destination file
	 */
	public static void writeFile(List<String> list, File file) {
		// Encode explicitly as UTF-8. The writer and the stream beneath it are closed by
		// try-with-resources, including when a write fails.
		try (BufferedWriter out = new BufferedWriter(
				new OutputStreamWriter(new FileOutputStream(file, true), StandardCharsets.UTF_8))) {
			for (String line : list) {
				out.write(line);
				out.newLine();
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

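工具类(配置文件需要自己编写,这里就不贴了。代码从 classpath 读取 jdbc.properties,用到的配置项有:filepath(结果输出文件)、interval(每个关键词抓取后的休眠毫秒数)、driverpath(keywords.csv 和 myfile.txt 所在目录)、cookie(登录后的 Cookie)。)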

import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;

/**
 * Loads the crawler configuration from the classpath.
 */
public class PropertiesUtil {

	/**
	 * Loads {@code jdbc.properties} from the classpath.
	 *
	 * <p>Loading is best-effort: if the resource is missing or cannot be read, the problem
	 * is reported on standard error and whatever was loaded so far (possibly nothing) is
	 * returned.
	 *
	 * @return the loaded properties; never {@code null}
	 */
	public static Properties getConfig() {
		Properties p = new Properties();
		// try-with-resources closes the stream; a null resource is simply skipped on close.
		try (InputStream in = PropertiesUtil.class.getClassLoader().getResourceAsStream("jdbc.properties")) {
			if (in == null) {
				// Without this check Properties.load(null) fails with a bare
				// NullPointerException that hides the real cause.
				System.err.println("jdbc.properties not found on the classpath; using empty configuration");
				return p;
			}
			p.load(in);
		} catch (IOException e) {
			e.printStackTrace();
		}
		return p;
	}
}

需要的依赖包:

		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpclient</artifactId>
			<version>4.5.7</version>
		</dependency>
		<dependency>
			<groupId>org.apache.httpcomponents</groupId>
			<artifactId>httpcore</artifactId>
			<version>4.4.11</version>
		</dependency>
		<dependency>
			<groupId>com.alibaba</groupId>
			<artifactId>fastjson</artifactId>
			<version>1.2.53</version>
		</dependency>
		<dependency>
			<groupId>org.apache.commons</groupId>
			<artifactId>commons-lang3</artifactId>
			<version>3.7</version>
		</dependency>
		

这是第一篇文章,有不足之处欢迎留言指正^ _ ^

你可能感兴趣的:(java 利用httpclient绕过百度验证登录爬取百度指数)