公司要求爬取百度指数的相关信息,发现需要百度登录验证。网上找了很多相关文章,都是用来模拟登录的过程,经过一番尝试,发现太复杂,最后失败。于是,换种方式,直接绕过登录。具体方式如下:
我用的是 Google Chrome 浏览器的开发者工具:登录百度指数后按 F12 打开调试工具,选中 XHR 过滤请求,查看对应接口的请求头信息。
将上图中的请求头信息全部设置到下面代码中的 setHeaders() 方法中去,即可绕过登录;其中最关键的是 Cookie 信息(代码中从配置项 cookie 读取)。
目录结构:
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import com.alibaba.fastjson.JSON;
public class Baiduzhishu {
// Configuration used throughout this class; the keys read here are
// "filepath", "interval", "cookie" and "driverpath".
private static Properties pro = null;
// Single HTTP client instance shared by every request issued from this class.
static CloseableHttpClient httpClient = HttpClients.createDefault();
// Loads the configuration through PropertiesUtil when the class is initialized.
static {
// pro was just initialized to null above, so this branch always runs.
if (pro == null) {
pro = PropertiesUtil.getConfig();
}
}
/**
 * Program entry point: runs a single crawl pass via {@code testAuto()}.
 * Any exception that escapes the crawl is printed and otherwise ignored.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    try {
        testAuto();
    } catch (Exception failure) {
        // Last-resort handler: report the failure and let the JVM exit normally.
        failure.printStackTrace();
    }
}
/**
 * Crawls the index figures for every pending keyword and appends the results
 * to the output file named by the "filepath" property.
 *
 * <p>Pending records come from {@code readFileAsList()}. They are processed in
 * pages of 30 and each page is flushed to disk with {@code writeFile} as soon
 * as it is complete. After each keyword the thread sleeps for the number of
 * milliseconds given by the "interval" property.
 *
 * <p>Each output line has the form
 * {@code keyword|secondColumn|sousuo|zixun|meiti|timestamp}.
 */
public static void testAuto() {
    try {
        String filename = pro.getProperty("filepath");
        int interval = Integer.parseInt(pro.getProperty("interval"));
        File file = new File(filename);
        List<String[]> list = readFileAsList();
        System.out.println("总共有:" + list.size() + "条记录。");
        if (list.isEmpty()) {
            System.out.println("全部爬取完成。。。");
            return;
        }
        int pageSize = 30;
        int total = list.size();
        // Step by page start index rather than computing a page count: the old
        // Math.ceil(total / pageSize) used integer division, so the final
        // partial page (or everything, when total < pageSize) was skipped.
        for (int start = 0; start < total; start += pageSize) {
            int end = Math.min(total, start + pageSize);
            List<String[]> page = list.subList(start, end);
            List<String> bufferList = new ArrayList<String>(page.size());
            // Iterate over the page itself so a short last page cannot overrun.
            for (String[] record : page) {
                String text = record[0];
                String sousuo = getSousuo(text);
                String zixun = getZixun(text);
                String meiti = getMeiti(text);
                Thread.sleep(interval);
                StringBuilder line = new StringBuilder();
                line.append(record[0]).append("|").append(record[1]).append("|").append(sousuo)
                        .append("|").append(zixun).append("|").append(meiti).append("|")
                        .append(DateUtil.getNowTime());
                bufferList.add(line.toString());
            }
            if (!bufferList.isEmpty()) {
                writeFile(bufferList, file);
            }
            // Report the real cumulative count, not a multiple of the page size.
            System.out.println("写入了" + end + "条数据");
        }
    } catch (InterruptedException e) {
        // Restore the interrupt flag so callers can observe the interruption.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
/**
 * Issues a GET request to {@code url} with the headers from {@code setHeaders}
 * and returns the response body as text.
 *
 * <p>The body is also echoed to standard output. The response is always
 * closed, so the underlying connection is released even when reading fails.
 *
 * @param url the absolute URL to fetch
 * @return the response body, or {@code null} when the request fails or the
 *         response carries no entity
 */
public static String getHtml(String url) {
    String result = null;
    try {
        HttpGet get = new HttpGet(url);
        setHeaders(get);
        try (CloseableHttpResponse res = httpClient.execute(get)) {
            if (res.getEntity() != null) {
                // Fall back to UTF-8 when the server does not declare a charset;
                // the library default (ISO-8859-1) would garble Chinese text.
                result = EntityUtils.toString(res.getEntity(), "UTF-8");
            }
        }
        System.out.println(result);
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result;
}
/**
 * Applies the browser-like request headers, including the logged-in session
 * cookie taken from the "cookie" property, to the given request.
 *
 * @param get the request to decorate; its headers are overwritten in place
 */
public static void setHeaders(HttpGet get) {
    final String acceptValue =
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3";
    final String userAgentValue =
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36";

    get.setHeader("Accept", acceptValue);
    get.setHeader("Accept-Encoding", "gzip, deflate");
    get.setHeader("Cache-Control", "max-age=0");
    get.setHeader("Connection", "keep-alive");
    // Cookie of an already logged-in session, prefixed with a PSTM value built
    // from the current time in milliseconds.
    String cookieValue = "PSTM=" + System.currentTimeMillis() + ";" + pro.getProperty("cookie");
    get.setHeader("Cookie", cookieValue);
    get.setHeader("Host", "index.baidu.com");
    get.setHeader("Upgrade-Insecure-Requests", "1");
    get.setHeader("User-Agent", userAgentValue);
}
/**
 * Fetches the 30-day search index average for a keyword.
 *
 * <p>Reads {@code data.generalRatio[0].all.avg} from the SearchApi response.
 *
 * @param keyword the keyword to look up; it is URL-encoded before being sent
 * @return the "avg" value as a string, or an empty string when the request
 *         fails, the response has no "data" object, or parsing fails
 */
public static String getSousuo(String keyword) {
    String sousuo = "";
    try {
        // Encode the keyword so spaces, '&', '#', '%' etc. cannot break the query string.
        String word = URLEncoder.encode(keyword, "UTF-8");
        String result = getHtml("http://index.baidu.com/api/SearchApi/index?word=" + word + "&area=0&days=30");
        if (result == null) {
            // getHtml already reported the failure; nothing to parse.
            return sousuo;
        }
        if (JSON.parseObject(result).getJSONObject("data") != null) {
            sousuo = JSON.parseObject(result).getJSONObject("data").getJSONArray("generalRatio").getJSONObject(0)
                    .getJSONObject("all").getString("avg");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return sousuo;
}
/**
 * Fetches the 30-day feed index average for a keyword.
 *
 * <p>Reads {@code data.index[0].generalRatio.avg} from the FeedSearchApi
 * response.
 *
 * @param keyword the keyword to look up; it is URL-encoded before being sent
 * @return the "avg" value as a string, or an empty string when the request
 *         fails, the response has no "data" object, or parsing fails
 */
public static String getZixun(String keyword) {
    String zixun = "";
    try {
        // Encode the keyword so spaces, '&', '#', '%' etc. cannot break the query string.
        String word = URLEncoder.encode(keyword, "UTF-8");
        String result = getHtml(
                "http://index.baidu.com/api/FeedSearchApi/getFeedIndex?word=" + word + "&area=0&days=30");
        if (result == null) {
            // getHtml already reported the failure; nothing to parse.
            return zixun;
        }
        if (JSON.parseObject(result).getJSONObject("data") != null) {
            zixun = JSON.parseObject(result).getJSONObject("data").getJSONArray("index").getJSONObject(0)
                    .getJSONObject("generalRatio").getString("avg");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return zixun;
}
/**
 * Fetches the 30-day average that is written to the "meiti" output column.
 *
 * <p>Reads {@code data.index[0].generalRatio.avg} from the FeedSearchApi
 * response.
 *
 * <p>NOTE(review): the endpoint and JSON path are identical to
 * {@code getZixun}, so both columns currently receive the same value. This
 * looks like a copy-paste slip; confirm which endpoint serves the media index
 * before changing the URL.
 *
 * @param keyword the keyword to look up; it is URL-encoded before being sent
 * @return the "avg" value as a string, or an empty string when the request
 *         fails, the response has no "data" object, or parsing fails
 */
public static String getMeiti(String keyword) {
    String meiti = "";
    try {
        // Encode the keyword so spaces, '&', '#', '%' etc. cannot break the query string.
        String word = URLEncoder.encode(keyword, "UTF-8");
        String result = getHtml(
                "http://index.baidu.com/api/FeedSearchApi/getFeedIndex?word=" + word + "&area=0&days=30");
        if (result == null) {
            // getHtml already reported the failure; nothing to parse.
            return meiti;
        }
        if (JSON.parseObject(result).getJSONObject("data") != null) {
            meiti = JSON.parseObject(result).getJSONObject("data").getJSONArray("index").getJSONObject(0)
                    .getJSONObject("generalRatio").getString("avg");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return meiti;
}
/**
 * Reads the keyword list and returns the records that still need crawling.
 *
 * <p>The input is {@code keywords.csv} under the "driverpath" directory, read
 * as UTF-8. A line is kept only when it contains no double quote, splits on
 * commas into exactly two fields, and is not already present in the finished
 * list returned by {@code readAsFinishedList()}.
 *
 * @return the pending records, each as a two-element array of the CSV fields
 * @throws IOException if either file cannot be read
 */
public static List<String[]> readFileAsList() throws IOException {
    List<String> finishedList = readAsFinishedList();
    System.out.println("已经完成的条数:" + finishedList.size());
    // Hash lookup instead of List.contains, which made the filter quadratic.
    Set<String> finished = new HashSet<String>(finishedList);
    List<String[]> list = new ArrayList<String[]>();
    File file = new File(pro.getProperty("driverpath") + "/keywords.csv");
    // Decode as UTF-8 directly. The previous FileReader + getBytes() round trip
    // went through the platform charset and corrupted text on non-UTF-8 systems.
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
        String line;
        // readLine() returning null is the reliable end-of-file signal; ready()
        // only says whether a read would block.
        while ((line = br.readLine()) != null) {
            String[] ts = line.split(",");
            if (!line.contains("\"") && ts.length == 2 && !finished.contains(line)) {
                list.add(ts);
            }
        }
    }
    return list;
}
/**
 * Reads the results file and returns the records that were already crawled.
 *
 * <p>The input is {@code myfile.txt} under the "driverpath" directory, read as
 * UTF-8. The first two {@code |}-separated fields of each line are re-joined
 * with a comma so they can be compared with raw lines of the keyword CSV.
 *
 * @return one {@code "field0,field1"} entry per well-formed line; empty when
 *         the results file does not exist yet
 * @throws IOException if the file exists but cannot be read
 */
public static List<String> readAsFinishedList() throws IOException {
    List<String> list = new ArrayList<String>();
    File file = new File(pro.getProperty("driverpath") + "/myfile.txt");
    if (!file.exists()) {
        // First run: nothing has been written yet, so nothing is finished.
        return list;
    }
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
        String line;
        while ((line = br.readLine()) != null) {
            String[] fields = line.split("\\|");
            if (fields.length < 2) {
                // Skip blank or truncated lines instead of failing the whole run.
                continue;
            }
            list.add(fields[0] + "," + fields[1]);
        }
    }
    return list;
}
/**
 * Appends the given lines to {@code file}, one per line, encoded as UTF-8.
 *
 * <p>The file is opened in append mode, so existing content is preserved. I/O
 * errors are printed and not propagated.
 *
 * @param list the lines to append, in order
 * @param file the destination file
 */
public static void writeFile(List<String> list, File file) {
    // Write UTF-8 explicitly. FileWriter plus the getBytes() round trip depended
    // on the platform charset and could corrupt non-ASCII text.
    try (BufferedWriter out = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(file, true), StandardCharsets.UTF_8))) {
        for (String line : list) {
            out.append(line);
            out.newLine();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
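工具类(配置文件 jdbc.properties 需要自己写一个并放在 classpath 下,至少包含 filepath、interval、cookie、driverpath 四个配置项,这里就不贴了;代码中用到的 DateUtil.getNowTime() 也需要自行实现。)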
import java.io.IOException;
import java.io.InputStream;
import java.util.Properties;
/**
 * Loads the application configuration from {@code jdbc.properties} on the
 * classpath.
 */
public class PropertiesUtil {
    /**
     * Reads {@code jdbc.properties} through this class's class loader.
     *
     * @return the loaded properties; empty (never {@code null}) when the
     *         resource is missing or cannot be read
     */
    public static Properties getConfig() {
        Properties p = new Properties();
        // try-with-resources closes the stream, which the original leaked.
        try (InputStream in = PropertiesUtil.class.getClassLoader().getResourceAsStream("jdbc.properties")) {
            if (in == null) {
                // Without this guard p.load(null) throws a NullPointerException.
                System.err.println("jdbc.properties not found on the classpath");
                return p;
            }
            p.load(in);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return p;
    }
}
需要的依赖包:
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.7</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.11</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.53</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.7</version>
</dependency>
这是第一篇文章,有不足之处欢迎留言指正^ _ ^