nodejs express搭建服务器(爬虫知乎精华帖,个人学习用)一 爬取所有话题类型

首先爬取话题广场所有话题。

var cheerio = require('cheerio');
var iconv = require('iconv-lite');
var https= require('https');
var mysql = require('mysql');
var querystring=require('querystring');

// Seed list of top-level topics. `ind` is presumably the numeric Zhihu topic id
// of each entry (TODO confirm; it is not consumed in the code shown here).
const topic_list = [
    { name: '生活方式', ind: 19555513 },
    { name: '经济学', ind: 19560170 },
    { name: '运动', ind: 19552706 },
    { name: '互联网', ind: 19550517 },
    { name: '艺术', ind: 19550434 },
    { name: '阅读', ind: 19550564 },
    { name: '美食', ind: 19551137 },
    { name: '动漫', ind: 19591985 },
    { name: '汽车', ind: 19551915 },
    { name: '足球', ind: 19559052 },
    { name: '教育', ind: 19553176 },
    { name: '摄影', ind: 19551388 },
    { name: '历史', ind: 19551077 },
    { name: '文化', ind: 19552266 },
    { name: '旅行', ind: 19551556 },
    { name: '职业发展', ind: 19554825 },
    { name: '金融', ind: 19609455 },
    { name: '游戏', ind: 19550994 },
    { name: '篮球', ind: 19562832 },
    { name: '生物学', ind: 19575492 },
    { name: '物理学', ind: 19556950 },
    { name: '化学', ind: 19562906 },
    { name: '科技', ind: 19556664 },
    { name: '体育', ind: 19554827 },
    { name: '商业', ind: 19555457 },
    { name: '健康', ind: 19550937 },
    { name: '创业', ind: 19550560 },
    { name: '设计', ind: 19551557 },
    { name: '自然科学', ind: 19553298 },
    { name: '法律', ind: 19550874 },
    { name: '电影', ind: 19550429 },
    { name: '音乐', ind: 19550453 },
    { name: '投资', ind: 19551404 }
];
// Number of seed topics.
const toplist_len = topic_list.length;
// Paging defaults (page size and starting offset).
const limit = 10;
let offset = 0;
// Site root.
const base_url = 'https://www.zhihu.com';

// Prefix and suffix of an "essence feed" API URL; presumably a topic id is
// placed between them and paging parameters are appended after the trailing
// '&' (TODO confirm; neither constant is used in the code shown here).
const base_url1 = 'https://www.zhihu.com/api/v4/topics/';
const base_url2 = '/feeds/essence?include=data%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.content%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Danswer%29%5D.target.is_normal%2Ccomment_count%2Cvoteup_count%2Ccontent%2Crelevant_info%2Cexcerpt.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Darticle%29%5D.target.content%2Cvoteup_count%2Ccomment_count%2Cvoting%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dtopic_sticky_module%29%5D.target.data%5B%3F%28target.type%3Dpeople%29%5D.target.answer_count%2Carticles_count%2Cgender%2Cfollower_count%2Cis_followed%2Cis_following%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%3Bdata%5B%3F%28target.type%3Danswer%29%5D.target.author.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Darticle%29%5D.target.annotation_detail%2Ccontent%2Chermes_label%2Cis_labeled%2Cauthor.badge%5B%3F%28type%3Dbest_answerer%29%5D.topics%3Bdata%5B%3F%28target.type%3Dquestion%29%5D.target.annotation_detail%2Ccomment_count%3B&';

// Pool of browser User-Agent strings used to make requests look less like a bot.
// Some entries appear more than once, so they are proportionally more likely
// to be picked.
const userAgents = [
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
    'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
];

/**
 * Pick one entry of `userAgents` at random.
 *
 * Uses Math.floor rather than parseInt: parseInt converts its argument to a
 * string first, so a very small product such as 1.9e-7 would be read as "1"
 * instead of being truncated to 0.
 *
 * @returns {string} A User-Agent header value taken from `userAgents`.
 */
function random_user_agent() {
    return userAgents[Math.floor(Math.random() * userAgents.length)];
}

// Options for the initial GET request. The User-Agent is chosen once, when
// this object is created.
var options = {
    // proxy:'http://183.166.111.42:9999',
    method: 'GET',
    headers: {
        "User-Agent": random_user_agent(),
        // "cookie": '__cfduid=dd8936888a6f1979387fac18b0e7cb1781563936793; PHPSESSID=g66j025p81urea4b35ku32alv4; Hm_lvt_2d527f7b5c8cdde7b45e368a84b53fe8=1564483203,1564484204,1564484208,1564484213; Hm_lpvt_2d527f7b5c8cdde7b45e368a84b53fe8=1565190149'
    }
};
// Create the MySQL connection used to store the scraped topics.
// Each setting can be overridden through an environment variable so that real
// credentials do not have to live in the source; the literals are the
// original local-development defaults.
var connection = mysql.createConnection({
    host     : process.env.MYSQL_HOST || 'localhost',
    user     : process.env.MYSQL_USER || 'root',
    password : process.env.MYSQL_PASSWORD || '12345678',
    database : process.env.MYSQL_DATABASE || 'testmysql'
});

// Question ids and answer ids collected so far.
let result = [];
// Topic-plaza URL with the first seed topic's name as the hash fragment,
// plus its percent-encoded form.
const url = `${base_url}/topics#${topic_list[0].name}`;
const cc = encodeURI(url);

// Top-level categories scraped from the topic-plaza page: {name, cate_id}.
const topic_class1 = [];
// Sub-topics collected by req_post for every category: {name, class_id}.
const topic_class2 = [];

/**
 * Fetch one page of sub-topics for the category at `topic_class1[ind]` and
 * append every parsed entry ({name, class_id}) to `topic_class2`.
 *
 * The request is abandoned after 5 seconds. On timeout the underlying request
 * is destroyed, so a late response can no longer add entries after the
 * promise has already been rejected (which would produce duplicates once the
 * caller retries).
 *
 * @param {number} ind - Index into `topic_class1` of the category to query.
 * @param {number} offset - Number of sub-topics of this category already fetched.
 * @returns {Promise<number>} Resolves with the offset to use for the next page,
 *   or with 0 when the page contains no entries. Rejects with an Error on
 *   network failure, timeout, or a response body that is not valid JSON.
 */
function req_post(ind, offset) {
    var id = topic_class1[ind].cate_id;
    return new Promise(function (resolve, reject) {
        // Form-encoded POST body; `params` is itself a JSON document.
        var postData = querystring.stringify({
            method: 'next',
            params: '{"topic_id": ' + id + ', "offset": ' + offset + ', "hash_id": "a88b36c207d708d8cb03049980e4645f"}'
        });
        var option = {
            hostname: 'www.zhihu.com',
            path: '/node/TopicsPlazzaListV2',
            method: 'POST',
            headers: {
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
                'Content-Length': Buffer.byteLength(postData)
            }
        };

        var settled = false;
        var timer = null;

        // Settle the promise at most once and always release the timer.
        function finish(settle, value) {
            if (settled) {
                return;
            }
            settled = true;
            clearTimeout(timer);
            settle(value);
        }

        var req = https.request(option, function (res) {
            var chunks = [];
            res.on('data', function (chunk) {
                chunks.push(chunk);
            });
            res.on('error', function (err) {
                finish(reject, err);
            });
            res.on('end', function () {
                if (settled) {
                    // Already timed out or failed: do not touch topic_class2.
                    return;
                }
                var items;
                try {
                    var body = iconv.decode(Buffer.concat(chunks), 'utf-8');
                    items = JSON.parse(body).msg;
                } catch (err) {
                    // Non-JSON body (error page, rate limiting, ...).
                    finish(reject, err);
                    return;
                }
                if (!Array.isArray(items) || items.length === 0) {
                    finish(resolve, 0);
                    return;
                }
                // Each item is an HTML fragment describing one sub-topic.
                for (var fragment of items) {
                    var $ = cheerio.load(fragment);
                    topic_class2.push({'name': $('strong').text(), 'class_id': $('[target]').attr('href')});
                }
                var next_offset = items.length + offset;
                console.log(ind, id, next_offset);
                finish(resolve, next_offset);
            });
        });

        // Overall deadline: destroying the request emits 'error' below.
        timer = setTimeout(function () {
            req.destroy(new Error('req_post timed out after 5000 ms'));
        }, 5000);

        req.on('error', function (err) {
            finish(reject, err);
        });
        req.write(postData);
        req.end();
    });
}

// Maximum number of retries for a single page before the category is skipped.
var REQ_MAX_RETRIES = 5;
// Base delay between retries; multiplied by the attempt number.
var REQ_RETRY_DELAY_MS = 1000;

/**
 * Insert every entry of `topic_class2` into the `zhihuClass` table, using the
 * array index as the row Id, then close the connection so the process can
 * exit once all queued inserts have run.
 */
function save_topics() {
    console.log(topic_class2);
    connection.connect(function (err) {
        if (err) {
            console.log('[CONNECT ERROR] - ', err.message);
        }
    });
    var addSql = 'INSERT INTO zhihuClass(Id, name, class_id) VALUES(?,?,?)';
    topic_class2.forEach(function (item, index) {
        console.log(index);
        // Queries are queued and executed asynchronously, in order.
        connection.query(addSql, [index, item.name, item.class_id], function (err) {
            if (err) {
                console.log('[INSERT ERROR] - ', err.message);
            }
        });
    });
    // end() lets the queued queries finish before closing the connection.
    connection.end(function (err) {
        if (err) {
            console.log('[CLOSE ERROR] - ', err.message);
        }
    });
}

/**
 * Crawl all sub-topics category by category, page by page, starting at
 * category `ind` and page offset `offset`; when every category has been
 * walked, store the collected sub-topics in MySQL.
 *
 * A failed page is retried after a growing delay, at most REQ_MAX_RETRIES
 * times; after that the rest of the category is skipped so the crawl can
 * still finish.
 *
 * @param {number} ind - Index into `topic_class1` of the current category.
 * @param {number} offset - Offset of the next page inside that category.
 * @param {number} [attempt=0] - Number of retries already made for this page.
 */
function req_id(ind, offset, attempt = 0) {
    if (ind >= topic_class1.length) {
        // No categories left (or none were found at all): persist the results.
        save_topics();
        return;
    }
    req_post(ind, offset).then(function (next_offset) {
        if (next_offset === 0) {
            // Category exhausted: move on to the next one.
            req_id(ind + 1, 0);
        } else {
            req_id(ind, next_offset);
        }
    }, function (err) {
        if (attempt >= REQ_MAX_RETRIES) {
            console.log('[REQUEST ERROR] - giving up on category', ind, 'at offset', offset, err && err.message);
            req_id(ind + 1, 0);
            return;
        }
        setTimeout(function () {
            req_id(ind, offset, attempt + 1);
        }, REQ_RETRY_DELAY_MS * (attempt + 1));
    }).catch(function (err) {
        // Anything thrown by the handlers above would otherwise be unhandled.
        console.log('[CRAWL ERROR] - ', err && err.message);
    });
}

// Entry point: download the topic-plaza page, collect the top-level
// categories into topic_class1, then start crawling their sub-topics.
https.get('https://www.zhihu.com/topics', options, (res)=>{
    var chunks = [];
    res.on('data', function (chunk) {
        chunks.push(chunk);
    });
    res.on('end', ()=>{
        var html = iconv.decode(Buffer.concat(chunks), 'utf-8');
        var $ = cheerio.load(html);
        // Every category element carries its id in the data-id attribute.
        $('.zm-topic-cat-item').each((index, value)=>{
            var name = $(value).text();
            var cate_id = $(value).attr('data-id');
            topic_class1.push({'name': name, 'cate_id': cate_id});
        });
        if (topic_class1.length === 0) {
            // Nothing to crawl (blocked, redirected, or the markup changed).
            console.error('[PARSE ERROR] - no topic categories found, HTTP status', res.statusCode);
            return;
        }
        req_id(0, 0);
    });
}).on('error', (e)=>{
    // Report request failures instead of swallowing them silently.
    console.error('[REQUEST ERROR] - ', e.message);
});



1、知乎的话题广场使用了前端路由(URL 里的 # 是基于 hash 的前端路由的标志)。
2、User-Agent列表是为了反反爬虫(虽然用处不大,但聊胜于无);
3、post请求中使用proxy IP代理(使用IP代理池更好,我这里只是尝试一下。后来发现知乎只在刚开始的时候封了一天的IP,我设置了请求间隔之后,就没有再被封过)。
4、cheerio用于以jQuery语法解析网页内容。
iconv-lite用于转换编码格式。
querystring用于将请求数据编码为URL查询字符串格式。

你可能感兴趣的:(nodejs express搭建服务器(爬虫知乎精华帖,个人学习用)一 爬取所有话题类型)