Node.js + Express 搭建服务器(爬取知乎精华帖,个人学习用)(二):找到每一类话题中精华帖的链接

var cheerio = require('cheerio');
var iconv = require('iconv-lite');
var https= require('https');
var mysql = require('mysql');

// Site root, used when building the public question/answer links.
var question_url = "https://www.zhihu.com";

// The feed URL is assembled as: base_url1 + <topic path> + base_url2 + paging parameters.
var base_url1 = "https://www.zhihu.com/api/v4";
var base_url2 = "/feeds/essence?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&";

// Paging: page size sent as `limit=`, and the initial offset value.
var limit = 10;
var offset = 0;
// Request headers.
// Pool of browser User-Agent strings; one is picked at random when the script starts.
// Every entry is unique so that each has the same chance of being selected.
const userAgents = [
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
    'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
];

// Options object handed to https.get for every feed request.
// A `proxy` entry or a "cookie" header can be added here if the site requires one.
var options = {
    method: 'GET',
    headers: {
        // Math.floor keeps the index inside [0, userAgents.length); parseInt on a
        // number goes through its string form and misreads exponent notation.
        "User-Agent": userAgents[Math.floor(Math.random() * userAgents.length)]
    }
};
// Create the MySQL connection object used by every query in this script.
var connection = mysql.createConnection({
    database: "testmysql",
    host: "localhost",
    user: "root",
    password: "12345678"
});

// Question ids and answer ids that have been obtained.
var result = [];

/**
 * Request one page of a topic feed and parse the body as JSON.
 *
 * The returned promise settles exactly once:
 *   - resolves with `topic.data` when more pages follow, or when the final page
 *     still carries items (so the last page is not thrown away);
 *   - rejects with 'end' when `paging.is_end` is set and the page has no items;
 *   - rejects with 1 when the body cannot be parsed or lacks `paging`;
 *   - rejects with 0 on a network/stream error or when nothing completed within 5 seconds.
 *
 * @param {string} uuu - Full URL of the page to request.
 * @returns {Promise<Array>} The `data` member of the parsed response.
 */
var request_topic = function(uuu){
  return new Promise(function (resolve, reject) {
      var settled = false;
      var timer = null;
      var req = null;

      // Settle once and always release the timeout timer so it cannot linger.
      function finish(settle, value) {
          if (settled) {
              return;
          }
          settled = true;
          clearTimeout(timer);
          settle(value);
      }

      // Every request is sent with this fixed desktop Chrome User-Agent.
      options.headers["User-Agent"] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36';

      // Give up after 5 seconds and abort the in-flight request so that a retry
      // does not run alongside the stalled one.
      timer = setTimeout(function () {
          finish(reject, 0);
          if (req) {
              req.destroy();
          }
      }, 5000);

      req = https.get(uuu, options, (res) => {
          var chunks = [];
          res.on('data', function (chunk) {
              chunks.push(chunk);
          });
          res.on('error', function () {
              finish(reject, 0);
          });
          res.on('end', () => {
              try {
                  var topic = JSON.parse(iconv.decode(Buffer.concat(chunks), 'utf-8'));
                  var items = topic.data;
                  var hasItems = Array.isArray(items) && items.length > 0;
                  // `paging.is_end` marks the last page; that page may still hold items.
                  if (topic.paging.is_end == false || hasItems) {
                      finish(resolve, items);
                  } else {
                      finish(reject, 'end');
                  }
              } catch (e) {
                  finish(reject, 1);
              }
          });
      });
      req.on('error', function () {
          finish(reject, 0);
      });
  })
};

// Running value written to the Id column of each row inserted into zhihuTable.
var kindex = 0;
// Retained for compatibility; request_ takes the category index as a parameter,
// which shadows this variable inside the function.
var cateindex = 0;

/**
 * Queue one INSERT into zhihuTable for every feed item that belongs to a question.
 *
 * @param {Array} items - The `data` array of one feed page.
 */
function saveEssenceItems(items) {
    var addSql = 'INSERT INTO zhihuTable(Id, questionId, answerId, title, url) VALUES(?,?,?,?,?)';
    for (let item of items) {
        // Entries without a question (presumably articles or other non-answer
        // items) have nothing to link to, so they are skipped.
        if (!item || !item.target || item.target.question == undefined) {
            continue;
        }
        var question = item.target.question;
        var addSqlParams = [
            kindex,
            question.id,
            item.target.id,
            question.title,
            question_url + '/question/' + question.id + '/answer/' + item.target.id
        ];
        kindex++;
        // The query is asynchronous; a failed insert is only logged.
        connection.query(addSql, addSqlParams, function (err) {
            if (err) {
                console.log('[INSERT ERROR] - ', err.message);
            }
        });
    }
}

/**
 * Crawl the feed of one category page by page, then move on to the next category.
 *
 * Looks up the category row at position `cateindex` in zhihuClass, requests the
 * page at offset `off`, stores its items and calls itself for the next step.
 * The crawl finishes (and the connection is closed) once the lookup returns no
 * row, i.e. every category has been processed.
 *
 * @param {number} off - Offset of the feed page to request.
 * @param {number} cateindex - Zero-based position of the category row in zhihuClass.
 */
function request_(off, cateindex){
    var sql = 'SELECT * from zhihuClass limit ' + cateindex + ',1 ';
    connection.query(sql, function (err, rows) {
        if (err) {
            console.log('[SELECT ERROR] - ', err.message);
            connection.end();
            return;
        }
        if (rows.length === 0) {
            // Past the last category: nothing left to crawl, so release the connection.
            connection.end();
            return;
        }

        var category = rows[0];
        console.log(off, cateindex, category.name);
        // class_id has an 's' inserted after its first six characters to form the API path.
        var uuu = base_url1 + category.class_id.slice(0, 6) + 's' + category.class_id.slice(6) + base_url2 + 'limit=' + limit + '&offset=' + off;

        request_topic(uuu).then(function (data) {
            if (data == undefined) {
                console.log('error ', 'undefined');
                request_(off + 10, cateindex);
                return;
            }
            if (data.length === 0) {
                // Empty page: there is nothing to store, just move to the next page.
                console.log('error ', 'data=0');
                request_(off + 10, cateindex);
                return;
            }

            saveEssenceItems(data);

            var nextOff = off + 10;
            if (nextOff % 100 === 0) {
                // Pause for 3 seconds after every 100 items.
                setTimeout(() => {
                    request_(nextOff, cateindex);
                }, 3000);
            } else {
                request_(nextOff, cateindex);
            }
        }, function (reason) {
            if (reason === 0) {
                // Network error or timeout: retry the same page.
                // NOTE(review): this retry is unbounded and has no delay.
                request_(off, cateindex);
                return;
            }
            if (reason === 1) {
                // Unparseable response: skip this page.
                console.log('error ', 1);
                request_(off + 10, cateindex);
                return;
            }
            // Any other reason ('end'): this category is finished. The next lookup
            // closes the connection by itself when no category is left.
            console.log('end ', category.name);
            request_(0, cateindex + 1);
        }).catch(function (e) {
            // Unexpected failure inside one of the handlers above: report it and stop.
            console.log('[CRAWL ERROR] - ', e && e.message ? e.message : e);
            connection.end();
        });
    });
}
// Entry point: open the database connection, then begin crawling with the
// first category (index 0) at offset 0.
(function start() {
    connection.connect();
    request_(0, 0);
})();



1、主要是逻辑上的问题。JavaScript 这种异步语言非常适合高并发的场景,但在流程控制上容易出现各种问题。比如在请求过程中,我希望上一个请求完成之后再发起下一个请求,如果直接使用原先的异步回调写法,是达不到这个效果的。所以在这里,我使用了 Promise,并在 Promise 的 then 回调里递归调用自身,从而保证请求按顺序依次执行。

你可能感兴趣的:(nodejs express搭建服务器(爬虫知乎精华帖,个人学习用)二 找到每一类话题中的精华帖的链接)