Because of Sogou's anti-crawling measures, search results on Sogou cannot be scraped directly, so the first step is to capture and analyze the requests behind a search:
I. Packet capture analysis:
1. The Sogou WeChat official account search page:
2. HTTP Referer:
HTTP Referer is part of the request header: when a browser sends a request to a web server, it usually includes a Referer telling the server which page the request was linked from, and the server can use that information in its processing. For example, if my homepage links to a friend's site, his server can use the HTTP Referer to count how many visitors click through from my homepage each day.
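As a minimal illustration (not part of the crawler below), a Referer header can be attached to a plain java.net.HttpURLConnection request like this; the URL and header values are only examples:

import java.net.HttpURLConnection;
import java.net.URL;

public class RefererDemo {
    public static void main(String[] args) throws Exception {
        HttpURLConnection conn =
                (HttpURLConnection) new URL("http://weixin.sogou.com/").openConnection();
        // Tell the server which page this request "came from".
        conn.setRequestProperty("Referer", "http://weixin.sogou.com/");
        conn.setRequestProperty("User-Agent", "Mozilla/5.0");
        System.out.println("HTTP " + conn.getResponseCode());
    }
}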
3. Analyze the captured request URL:
http://weixin.sogou.com/weixin?type=1
&_sug_type_=
&query=%E6%B1%BD%E8%BD%A6+%E4%BA%91%E5%8D%97   // URL-encoded "汽车 云南" (keyword "car" plus city "Yunnan")
&ie=utf8
&_sug_=y
&w=01019900
&sut=2257
&sst0=1475908133010   // current timestamp (ms)
&lkt=0%2C0%2C0
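As a quick check of what the query parameter carries, the percent-encoded value can be decoded (a standalone snippet, not part of the crawler):

import java.net.URLDecoder;

public class DecodeQuery {
    public static void main(String[] args) throws Exception {
        // URLDecoder turns '+' into a space, so this prints: 汽车 云南 (keyword + city)
        System.out.println(URLDecoder.decode("%E6%B1%BD%E8%BD%A6+%E4%BA%91%E5%8D%97", "utf-8"));
    }
}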
II. Data fetching:
Make the requests with the Referer header set and parse the responses (Sogou presumably rate-limits simulated requests by frequency; if you get blocked, access recovers after a while).
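The helper ToolkitForSpider.getHtmlDoc used below is not shown here; a minimal sketch, assuming jsoup and with illustrative Referer/User-Agent values, could look like this:

import java.io.IOException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

public class ToolkitForSpider {
    public static Document getHtmlDoc(String url) {
        try {
            return Jsoup.connect(url)
                    .referrer("http://weixin.sogou.com/")   // the Referer discussed above
                    .userAgent("Mozilla/5.0")               // a plausible browser UA
                    .timeout(10000)
                    .get();
        } catch (IOException e) {
            e.printStackTrace();
            return null;
        }
    }
}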
Request URL parameters:
// jsoup handles the HTML parsing; ToolkitForSpider and DBLib are project helper classes.
import java.io.UnsupportedEncodingException;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.Arrays;
import java.util.List;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

// Fixed query-string parts; the sut/lkt values are copied from the captured request above.
private static final String OTHER_PARAMS = "&_sug_=y&w=01019900&sut=1790&lkt=0%2C0%2C0";
private static final String BASE_URL = "http://weixin.sogou.com/weixin?type=1&ie=utf8&_sug_type_=" + OTHER_PARAMS;
private static void start() {
    String[] keyWords = {"汽车", "二手车", "车"};   // search keywords: "car", "used car", "vehicle"
    List<String> citys = getCity();
    System.out.println("--- start ---");
    for (String keyWord : keyWords) {
        for (String city : citys) {
            try {
                // Build "keyword+city" as the URL-encoded query parameter.
                String urlSuffix = java.net.URLEncoder.encode(keyWord, "utf-8")
                        + "+" + java.net.URLEncoder.encode(city, "utf-8");
                String url = BASE_URL + "&query=" + urlSuffix + "&sst0=" + System.currentTimeMillis();
                System.out.println("fetch : " + url);
                fetch(url);
            } catch (UnsupportedEncodingException e) {
                e.printStackTrace();
            }
        }
    }
}
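Given the frequency-based blocking mentioned above, it may help to pause between requests; a hypothetical helper, called after each fetch(url):

private static void politePause() {
    try {
        // Hypothetical fixed delay; tune or randomize it to stay under Sogou's rate limit.
        Thread.sleep(3000);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}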
private static List<String> getCity() {
    // Province-level regions used as the city half of the query.
    return Arrays.asList("贵州", "甘肃", "广西", "浙江", "福建", "安徽", "香港", "广东", "海南", "河北",
            "河南", "黑龙江", "重庆", "辽宁", "湖北", "湖南", "吉林", "江苏", "江西", "天津",
            "内蒙古", "四川", "宁夏", "青海", "山东", "山西", "陕西", "上海", "西藏", "北京", "新疆", "云南");
}
Fetch the data and store it:
private static void fetch(String url) {
    Document doc = ToolkitForSpider.getHtmlDoc(url);
    // Each search result: account name in h3, WeChat id in h4 span label,
    // and the account link on the result item element itself.
    Elements eles = doc.select("div div.txt-box");
    Elements urlEles = doc.select("div.wx-rb.bg-blue.wx-rb_v1._item");
    for (int i = 0; i < eles.size(); i++) {
        String name = eles.get(i).select("h3").text();
        String code = eles.get(i).select("h4 span label").text();
        String wechat_url = urlEles.get(i).attr("href");
        System.out.println(name + " " + code + " " + wechat_url);
        insert(name, code, wechat_url);
    }
    // Pagination links; skip the last anchor, which is the "next page" link.
    Elements pageEles = doc.select("div.p a");
    int totalPage = pageEles.size();
    for (int i = 0; i < totalPage - 1; i++) {
        String pageUrl = "http://weixin.sogou.com/weixin" + pageEles.get(i).attr("href");
        System.out.println("fetch : " + pageUrl);
        Document pageDoc = ToolkitForSpider.getHtmlDoc(pageUrl);
        Elements pageUrlEles = pageDoc.select("div div.txt-box");
        // Select the account links from the current page, not from the first page's doc.
        Elements pageLinkEles = pageDoc.select("div.wx-rb.bg-blue.wx-rb_v1._item");
        for (int j = 0; j < pageUrlEles.size(); j++) {
            String name = pageUrlEles.get(j).select("h3").text();
            String code = pageUrlEles.get(j).select("h4 span label").text();
            String wechat_url = pageLinkEles.get(j).attr("href");
            System.out.println(name + " " + code + " " + wechat_url);
            insert(name, code, wechat_url);
        }
    }
}
private static void insert(String name, String code, String url) {
    Connection conn = ToolkitForSpider.getMySqlConnection();
    // Insert the account; if it already exists (duplicate key), just refresh update_time.
    String sql = "insert into inf_weixin(wx_name, wx_code, biz_url, last_update, last_update_title, newrank_index, update_time) "
            + "values(?,?,?,?,?,?,now()) on duplicate key update update_time = now()";
    Object[] values = new Object[]{name, code, url, "1970-01-01", "-1", "-1"};
    try {
        DBLib.update(conn, sql, values);
    } catch (SQLException e) {
        e.printStackTrace();
    } finally {
        ToolkitForSpider.close(conn);
    }
}
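DBLib.update is another project helper not shown here; a minimal sketch using a JDBC PreparedStatement (its exact signature and behavior are assumptions):

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;

public class DBLib {
    public static int update(Connection conn, String sql, Object[] values) throws SQLException {
        try (PreparedStatement ps = conn.prepareStatement(sql)) {
            // Bind parameters in order; JDBC placeholders are 1-indexed.
            for (int i = 0; i < values.length; i++) {
                ps.setObject(i + 1, values[i]);
            }
            return ps.executeUpdate();
        }
    }
}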