var cheerio = require('cheerio');
var iconv = require('iconv-lite');
var https= require('https');
var mysql = require('mysql');
// Paging: how many feed items are requested per call, and the starting offset.
const limit = 10;
const offset = 0;
// Site root; stored answer links are built on top of it.
const question_url = 'https://www.zhihu.com';
// A feed URL is assembled as base_url1 + <topic path> + base_url2 + paging parameters.
const base_url1 = 'https://www.zhihu.com/api/v4';
const base_url2 = '/feeds/essence?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&';
// Request headers.
// Pool of browser User-Agent strings; one entry is drawn for the request headers below.
const userAgents = [
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
  'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
  'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
  'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
  'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
  'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
  'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
  'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
];

/**
 * Pick one entry of `userAgents` at random.
 *
 * @param {function(): number} [random=Math.random] Source of numbers in [0, 1).
 * @returns {string} The chosen User-Agent string.
 */
function pickUserAgent(random = Math.random) {
  // Math.floor rather than parseInt: parseInt converts its argument to a
  // string first, so a tiny product such as 9.5e-7 would be parsed as 9.
  return userAgents[Math.floor(random() * userAgents.length)];
}

// Options object handed to every HTTPS request; the User-Agent is drawn once, when this object is built.
var options = {
  //proxy:'http://183.166.111.42:9999',
  method: 'GET',
  headers: {
    "User-Agent": pickUserAgent(),
    // "cookie": '__cfduid=dd8936888a6f1979387fac18b0e7cb1781563936793; PHPSESSID=g66j025p81urea4b35ku32alv4; Hm_lvt_2d527f7b5c8cdde7b45e368a84b53fe8=1564483203,1564484204,1564484208,1564484213; Hm_lpvt_2d527f7b5c8cdde7b45e368a84b53fe8=1565190149'
  }
};
/**
 * Read one database setting from the environment.
 *
 * @param {string} name     Environment variable to look up.
 * @param {string} fallback Value used when the variable is not set.
 * @returns {string} The environment value, or `fallback` when it is undefined.
 */
function dbSetting(name, fallback) {
  var value = process.env[name];
  return value === undefined ? fallback : value;
}

// Create the MySQL connection object.
// Each setting can be overridden through MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD
// and MYSQL_DATABASE so credentials need not live in the source; without those
// variables the previous hard-coded values are used unchanged.
var connection = mysql.createConnection({
  host: dbSetting('MYSQL_HOST', 'localhost'),
  user: dbSetting('MYSQL_USER', 'root'),
  password: dbSetting('MYSQL_PASSWORD', '12345678'),
  database: dbSetting('MYSQL_DATABASE', 'testmysql')
});

// Meant to hold the collected question ids and answer ids.
var result = [];
/**
 * Fetch one feed page over HTTPS and hand back its items as a Promise.
 *
 * @param {string} uuu Full URL of the feed page to request.
 * @returns {Promise<Array>} Resolves with the `data` member of the parsed
 *   response when `paging.is_end` is false. Rejects with:
 *   - 0     on a request/response error, or when nothing settled within 5 seconds;
 *   - 1     when the body cannot be parsed as JSON or lacks `paging`;
 *   - 'end' when `paging.is_end` is not false.
 */
var request_topic = function (uuu) {
  return new Promise(function (resolve, reject) {
    // Every request goes out with this fixed User-Agent, written onto the shared options object.
    options.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36';

    var req = null;
    var timer = null;

    // Cancel the watchdog before settling, so a finished request leaves no
    // live timer behind. Settling twice is harmless: a Promise only honours
    // the first resolve/reject call.
    function settle(fn, value) {
      clearTimeout(timer);
      fn(value);
    }

    // Watchdog: give up after 5 seconds and tear the stalled request down
    // instead of letting it keep downloading in the background.
    timer = setTimeout(function () {
      if (req) {
        req.destroy();
      }
      settle(reject, 0);
    }, 5000);

    req = https.get(uuu, options, function (res) {
      var chunks = [];
      var length = 0;

      res.on('data', function (chunk) {
        chunks.push(chunk);
        length += chunk.length;
      });

      res.on('error', function () {
        settle(reject, 0);
      });

      res.on('end', function () {
        var topic;
        var isLastPage;
        try {
          topic = JSON.parse(iconv.decode(Buffer.concat(chunks, length), 'utf-8'));
          // Throws when `paging` is missing, which is reported as reason 1 below.
          isLastPage = topic.paging.is_end !== false;
        } catch (e) {
          settle(reject, 1);
          return;
        }
        if (isLastPage) {
          settle(reject, 'end');
        } else {
          settle(resolve, topic.data);
        }
      });
    });

    req.on('error', function () {
      settle(reject, 0);
    });
  });
};
// Value written to the Id column of the next stored row; advanced once per stored answer.
let kindex = 0;
// Module-level category index. request_ works on its own `cateindex` parameter, which shadows this one.
let cateindex = 0;
/**
 * Crawl one feed page of one topic category, store its answers, and continue.
 *
 * Reads the category row at position `cateindex` of `zhihuClass`, requests the
 * feed page that starts at `off`, inserts one `zhihuTable` row per answer, and
 * then calls itself for the next page. When the feed reports its end, the crawl
 * moves on to the next category; when no category row is left (or the lookup
 * fails) the database connection is closed.
 *
 * @param {number} off       Offset of the first feed item to request.
 * @param {number} cateindex Zero-based position of the category row in `zhihuClass`.
 */
function request_(off, cateindex) {
  // The row position is bound through a placeholder instead of being spliced into the SQL text.
  var sql = 'SELECT * from zhihuClass limit ?,1 ';
  connection.query(sql, [cateindex], function (err, rows) {
    if (err) {
      console.log('[SELECT ERROR] - ', err.message);
      connection.end();
      return;
    }
    if (rows.length === 0) {
      // Walked past the last category: the crawl is complete, so release the connection.
      connection.end();
      return;
    }

    var category = rows[0];
    var nextOff = off + limit;
    console.log(off, cateindex, category.name);
    // class_id gets an 's' inserted after its first six characters before it is appended to the API base.
    var uuu = base_url1 + category.class_id.slice(0, 6) + 's' + category.class_id.slice(6) + base_url2 + 'limit=' + limit + '&offset=' + off;

    request_topic(uuu).then(function (data) {
      if (!Array.isArray(data)) {
        console.log('error ', 'undefined');
        request_(nextOff, cateindex);
        return;
      }
      if (data.length === 0) {
        // Nothing to store on this page; just move on to the next one.
        console.log('error ', 'data=0');
        request_(nextOff, cateindex);
        return;
      }

      var addSql = 'INSERT INTO zhihuTable(Id, questionId, answerId, title, url) VALUES(?,?,?,?,?)';
      for (const item of data) {
        // Only feed items that carry a question are stored.
        if (!item.target || item.target.question == null) {
          continue;
        }
        var addSqlParams = [kindex, item.target.question.id, item.target.id, item.target.question.title, question_url + '/question/' + item.target.question.id + '/answer/' + item.target.id];
        // The insert completes asynchronously, so the id is advanced here rather than in its callback.
        kindex++;
        connection.query(addSql, addSqlParams, function (insertErr) {
          if (insertErr) {
            console.log('[INSERT ERROR] - ', insertErr.message);
          }
        });
      }

      if (nextOff % 100 === 0) {
        // Pause for three seconds whenever the offset reaches a multiple of 100.
        setTimeout(function () {
          request_(nextOff, cateindex);
        }, 3000);
      } else {
        request_(nextOff, cateindex);
      }
    }, function (reason) {
      if (reason === 0) {
        // Timeout or network failure: ask for the same page again.
        request_(off, cateindex);
      } else if (reason === 1) {
        // Unparseable response: skip this page.
        console.log('error ', 1);
        request_(nextOff, cateindex);
      } else {
        // End of this category's feed. Start the next category from offset 0;
        // the lookup above closes the connection once no category is left.
        console.log('end ', category.name);
        request_(0, cateindex + 1);
      }
    }).catch(function (e) {
      // An unexpected exception would otherwise surface as an unhandled rejection.
      console.log('[CRAWL ERROR] - ', e && e.message);
      connection.end();
    });
  });
}
// Entry point.
// Connect to the database, then start crawling at offset 0 of the category at row position 0.
connection.connect();
request_(0, 0);
// 1、主要是逻辑上的问题。JavaScript 这种异步语言非常适合高并发的场景,但在逻辑上容易出现各种问题。比如在请求过程中,我想让上一个请求完成之后再发起下一个请求,如果直接使用原先的异步写法,是达不到这个效果的。所以在这里我使用了 Promise,在 Promise 的 then 回调里递归调用自身。