首先爬取话题广场所有话题。
var cheerio = require('cheerio');
var iconv = require('iconv-lite');
var https= require('https');
var mysql = require('mysql');
var querystring=require('querystring');
// Table of top-level topic categories: each entry pairs a display name with a
// numeric id stored under `ind`.
// NOTE(review): `ind` is presumably the Zhihu topic id used to build API URLs — confirm.
// In the code visible here only `topic_list.length` and `topic_list[0].name` are read.
var topic_list = [{'name': '生活方式', 'ind': 19555513}, {'name': '经济学','ind': 19560170}, {'name': '运动', 'ind': 19552706}, {'name': '互联网', 'ind': 19550517}, {'name': '艺术', 'ind': 19550434}, {'name': '阅读', 'ind': 19550564}, {'name': '美食', 'ind': 19551137},
{'name': '动漫', 'ind': 19591985}, {'name': '汽车','ind': 19551915}, {'name': '足球', 'ind': 19559052}, {'name': '教育', 'ind': 19553176}, {'name': '摄影', 'ind': 19551388}, {'name': '历史', 'ind': 19551077}, {'name': '文化', 'ind': 19552266}, {'name': '旅行', 'ind': 19551556}, {'name': '职业发展', 'ind': 19554825}, {'name': '金融', 'ind': 19609455}, {'name': '游戏', 'ind': 19550994}, {'name': '篮球', 'ind': 19562832},
{'name': '生物学', 'ind': 19575492}, {'name': '物理学', 'ind': 19556950}, {'name': '化学', 'ind': 19562906}, {'name': '科技', 'ind': 19556664}, {'name': '体育', 'ind': 19554827}, {'name': '商业', 'ind': 19555457}, {'name': '健康', 'ind': 19550937}, {'name': '创业', 'ind': 19550560}, {'name': '设计', 'ind': 19551557}, {'name': '自然科学', 'ind': 19553298}, {'name': '法律', 'ind': 19550874}, {'name': '电影', 'ind': 19550429},
{'name': '音乐', 'ind': 19550453}, {'name': '投资', 'ind': 19551404}];
// Number of entries in topic_list.
var toplist_len = topic_list.length;
// NOTE(review): `limit` and the module-level `offset` are not read by any code
// visible here (req_post/req_id use their own `offset` parameter); presumably
// paging parameters for the feeds API below — confirm against the rest of the file.
var limit = 10;
var offset = 0;
// Site root, used below to build `url`.
var base_url = 'https://www.zhihu.com';
// Prefix and suffix of a topic "essence" feed API URL.
// NOTE(review): presumably concatenated as base_url1 + <topic id> + base_url2 + paging
// query parameters (base_url2 ends with '&'); that code is not visible here — confirm.
var base_url1 = 'https://www.zhihu.com/api/v4/topics/';
var base_url2 = '/feeds/essence?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&';
// Pool of browser User-Agent strings; one is picked at random when `options` is built.
// NOTE(review): several strings appear more than once (e.g. the first Ubuntu/Firefox
// entry, the Kazehakase entry and the QQBrowser entry), so those are picked more often.
const userAgents = [
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
];
// Options for the initial GET request that loads the topic plaza page.
var options = {
    // proxy:'http://183.166.111.42:9999',
    method: 'GET',
    headers: {
        // One User-Agent is chosen at random, once, when this object is built.
        // Math.floor is used instead of parseInt: parseInt converts its argument
        // to a string first, so a tiny product such as 5e-7 would be read as 5
        // instead of 0.
        "User-Agent": userAgents[Math.floor(Math.random() * userAgents.length)],
        // "cookie": '__cfduid=dd8936888a6f1979387fac18b0e7cb1781563936793; PHPSESSID=g66j025p81urea4b35ku32alv4; Hm_lvt_2d527f7b5c8cdde7b45e368a84b53fe8=1564483203,1564484204,1564484208,1564484213; Hm_lpvt_2d527f7b5c8cdde7b45e368a84b53fe8=1565190149'
    }
};
// Create the MySQL connection object; connection.connect() is called later,
// once crawling has finished and the rows are ready to be inserted.
// Each setting can be overridden through an environment variable so that
// credentials do not have to live in the source; when a variable is unset the
// original local-development value is used.
var connection = mysql.createConnection({
    host     : process.env.MYSQL_HOST || 'localhost',
    user     : process.env.MYSQL_USER || 'root',
    password : process.env.MYSQL_PASSWORD || '12345678',
    database : process.env.MYSQL_DATABASE || 'testmysql'
});
// Collected question ids and answer ids.
// NOTE(review): nothing in the code visible here writes to `result` — confirm it is used elsewhere.
var result = [];
// URL of the topic plaza with the first topic's name as the hash fragment, and
// its percent-encoded form. Neither is read by the code visible here.
var url = base_url + '/topics#' + topic_list[0].name;
var cc = encodeURI(url);
// Top-level categories scraped from the topic plaza page: {name, cate_id}.
// Filled by the https.get response handler; read by req_post and req_id.
var topic_class1 = [];
// Sub-topics collected page by page in req_post: {name, class_id}.
// Written to the zhihuClass table by req_id once every category is done.
var topic_class2 = [];
/**
 * Fetch one page of sub-topics for a top-level category from the topic plaza
 * endpoint and append every returned item to `topic_class2`.
 *
 * The promise settles exactly once. On timeout the in-flight request is
 * destroyed and any late response is ignored, so a retry can never cause the
 * same page to be appended to `topic_class2` twice.
 *
 * @param {number} ind - Index into `topic_class1` of the category to query.
 * @param {number} offset - Number of sub-topics already fetched for that category.
 * @returns {Promise<number>} Resolves with the offset to use for the next page,
 *     or with 0 when the server returned an empty page. Rejects with 0 on
 *     timeout (5 s), network error, or a response that cannot be parsed.
 */
function req_post(ind, offset) {
    var id = topic_class1[ind].cate_id;
    return new Promise(function (resolve, reject) {
        // Form-encoded body; `params` is itself a JSON document.
        var postData = querystring.stringify({
            method: 'next',
            params: '{"topic_id": ' + id + ', "offset": ' + offset + ', "hash_id": "a88b36c207d708d8cb03049980e4645f"}'
        });
        var option = {
            hostname: 'www.zhihu.com',
            path: '/node/TopicsPlazzaListV2',
            method: 'POST',
            headers: {
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Content-Length': Buffer.byteLength(postData)
            }
        };

        var settled = false;
        var timer = null;

        // Settle the promise at most once and always release the timeout timer.
        function settle(fn, value) {
            if (settled) {
                return;
            }
            settled = true;
            clearTimeout(timer);
            fn(value);
        }

        var req = https.request(option, function (res) {
            var chunks = [];
            res.on('data', function (chunk) {
                chunks.push(chunk);
            });
            res.on('error', function () {
                settle(reject, 0);
            });
            res.on('end', function () {
                // The request already timed out or failed: drop the late
                // response instead of touching topic_class2.
                if (settled) {
                    return;
                }
                var items;
                try {
                    var body = iconv.decode(Buffer.concat(chunks), 'utf-8');
                    items = JSON.parse(body).msg;
                } catch (err) {
                    console.log('[PARSE ERROR] - ', err.message);
                    settle(reject, 0);
                    return;
                }
                if (!Array.isArray(items)) {
                    console.log('[PARSE ERROR] - ', 'response has no "msg" array');
                    settle(reject, 0);
                    return;
                }
                if (items.length === 0) {
                    // Empty page: this category is exhausted.
                    settle(resolve, 0);
                    return;
                }
                // Each item is an HTML fragment describing one sub-topic.
                for (var html of items) {
                    var $ = cheerio.load(html);
                    topic_class2.push({'name': $('strong').text(), 'class_id': $('[target]').attr('href')});
                }
                var nextOffset = offset + items.length;
                console.log(ind, id, nextOffset);
                settle(resolve, nextOffset);
            });
        });

        // Give up after 5 seconds and abort the underlying request.
        timer = setTimeout(function () {
            settle(reject, 0);
            req.destroy();
        }, 5000);

        req.on('error', function () {
            settle(reject, 0);
        });
        req.write(postData);
        req.end();
    });
}
/**
 * Drive the crawl: fetch every page of every category in `topic_class1`, one
 * request at a time, then write all collected sub-topics to MySQL.
 *
 * A failed request is retried with the same arguments after a short delay,
 * with no upper bound on the number of attempts. After the last category is
 * exhausted the rows in `topic_class2` are inserted into `zhihuClass` and the
 * connection is closed so the process can exit.
 *
 * @param {number} ind - Index into `topic_class1` of the category being crawled.
 * @param {number} offset - Offset of the next page to request for that category.
 */
function req_id(ind, offset){
    var RETRY_DELAY_MS = 1000;

    req_post(ind, offset).then(function (nextOffset) {
        if (nextOffset !== 0) {
            // More pages may exist for this category.
            req_id(ind, nextOffset);
            return;
        }

        var nextInd = ind + 1;
        if (nextInd < topic_class1.length) {
            // Category exhausted: start the next one from its first page.
            req_id(nextInd, 0);
            return;
        }

        // Every category is done: persist the results.
        console.log(topic_class2);
        connection.connect();
        var addSql = 'INSERT INTO zhihuClass(Id, name, class_id) VALUES(?,?,?)';
        topic_class2.forEach(function (item, rowId) {
            console.log(rowId);
            // Queries are queued and run asynchronously, in order.
            connection.query(addSql, [rowId, item.name, item.class_id], function (err) {
                if (err) {
                    console.log('[INSERT ERROR] - ', err.message);
                }
            });
        });
        // end() lets the queued queries finish before closing; without it the
        // open connection keeps the process alive forever.
        connection.end(function (err) {
            if (err) {
                console.log('[CLOSE ERROR] - ', err.message);
            }
        });
    }, function () {
        // Request failed or timed out: retry the same page after a pause so an
        // immediately-failing request does not turn into a busy loop.
        setTimeout(function () {
            req_id(ind, offset);
        }, RETRY_DELAY_MS);
    }).catch(function (err) {
        // Anything thrown by the handlers above would otherwise be an
        // unhandled rejection.
        console.log('[CRAWL ERROR] - ', err);
    });
}
// Entry point: download the topic plaza page, collect the top-level categories
// into `topic_class1`, then start crawling their sub-topics.
https.get('https://www.zhihu.com/topics', options, (res) => {
    var chunks = [];
    res.on('data', (chunk) => {
        chunks.push(chunk);
    });
    res.on('end', () => {
        var html = iconv.decode(Buffer.concat(chunks), 'utf-8');
        var $ = cheerio.load(html);
        // Each category element carries its id in the data-id attribute.
        $('.zm-topic-cat-item').each((index, value) => {
            var item = $(value);
            topic_class1.push({'name': item.text(), 'cate_id': item.attr('data-id')});
        });
        if (topic_class1.length === 0) {
            // Nothing to crawl (unexpected page, e.g. blocked or changed
            // markup); req_id(0, 0) would throw on topic_class1[0].
            console.log('[CRAWL ERROR] - ', 'no topic categories found, HTTP status ' + res.statusCode);
            return;
        }
        req_id(0, 0);
    });
}).on('error', (e) => {
    // Report the failure instead of swallowing it silently.
    console.log('[REQUEST ERROR] - ', e.message);
});
1、知乎在这里使用了前端路由(url里的#是基于hash的前端路由的标志)。
2、User-Agent列表是为了反反爬虫(虽然用处不大,但聊胜于无);
3、post请求中使用proxy IP代理(使用IP代理池更好,我这里只是尝试了一下;后来发现知乎只在刚开始的时候封了一天的IP,我设置了请求间隔之后就没有再被封了)。
4、cheerio是为了使用jq语法解析网页内容。
iconv-lite是改变编码格式。
querystring是将请求数据编码为url格式。