利用cheerio和request模块写的爬虫(写给自己，这是别人写过的)

先去了解了解cheerio和request模块

cheerio: https://www.npmjs.com/package/cheerio

request: https://www.npmjs.com/package/request


我用vue的webpack模板习惯了，所以还是在这个模板项目里用npm安装这两个包。


在src下新建spider.js文件用来写脚本代码

新建data目录用来存放爬到的数据

新建images文件夹用来存放图片


爬取的是北大软件与微电子学院新闻


1.引入依赖
const http = require('http')
const fs = require('fs')
const cheerio = require('cheerio')
const request = require('request')

// Count of article pages processed so far. startRequest increments it once
// per page and compares it against 500 before moving on to the next page.
let i = 0;
// Initial URL: the first article page the crawl starts from.
const url = "http://www.ss.pku.edu.cn/index.php/newscenter/news/2391";
// Kick off the crawl.
startRequest(url)

/**
 * Downloads one article page, logs a summary of it, saves its text and
 * images, then moves on to the page referenced by the "next" link.
 *
 * The crawl continues while the module-level counter `i` is at most 500 and
 * the page still has a "next" link.
 *
 * @param {string} x - URL of the article page to fetch.
 */
function startRequest(x) {
	http.get(x, function(res) {
		let html = '';
		res.setEncoding('utf-8');
		// The body arrives in chunks; accumulate them until the response ends.
		res.on('data', function(chunk) {
			html += chunk;
		});
		res.on('end', function() {
			const $ = cheerio.load(html);
			const news_title = $('div.article-title a').text().trim();
			const time = $('.article-info a:first-child').next().text().trim();
			const new_item = {
				title: news_title,
				Time: time,
				link: "http://www.ss.pku.edu.cn" + $('div.article-title a').attr('href'),
				author: $('[title=供稿]').text().trim(),
				i: i++
			};
			console.log(new_item);
			savedContent($, news_title);
			savedImg($, news_title);

			// Without a "next" link there is nothing left to crawl; bail out
			// instead of requesting a URL that ends in "undefined".
			const nextHref = $('li.next a').attr('href');
			if (!nextHref) {
				return;
			}
			// Only the part of the link before the first '-' is kept
			// (presumably the rest is a title slug the server does not need —
			// TODO confirm), then non-ASCII characters are percent-encoded.
			const nextLink = 'http://www.ss.pku.edu.cn' + nextHref;
			const str = encodeURI(nextLink.split('-')[0]);
			if (i <= 500) {
				// Recurse into this same function; the original called an
				// undefined `fetchPage`, which threw a ReferenceError.
				startRequest(str);
			}
		});
	}).on('error', function(err) {
		console.log(err);
	});
}

/**
 * Appends the article's body paragraphs to `./data/<news_title>.txt`.
 *
 * Only paragraphs whose first two characters are whitespace (or that are
 * blank) are kept — presumably the indented body paragraphs of the article;
 * verify against the site's markup. Each kept paragraph is followed by a
 * newline.
 *
 * @param {Function} $ - cheerio instance loaded with the article page.
 * @param {string} news_title - Article title, used as the output file name.
 */
function savedContent($, news_title) {
	const paragraphs = [];
	$('.article-content p').each(function(index, item) {
		const x = $(this).text();
		if (x.substring(0, 2).trim() === '') {
			paragraphs.push(x + '\n');
		}
	});

	// Nothing matched: do not create an empty file.
	if (paragraphs.length === 0) {
		return;
	}

	// Write everything in one call. Issuing one asynchronous appendFile per
	// paragraph gives no ordering guarantee between the writes, so the
	// paragraphs could end up shuffled in the output file.
	fs.appendFile('./data/' + news_title + '.txt', paragraphs.join(''), 'utf-8', function(err) {
		if (err) {
			console.log(err);
		}
	});
}

/**
 * Downloads every image inside the article body into
 * `./images/<news_title>---<caption>.jpg`.
 *
 * The caption is the text of the element following the image's parent; when
 * it is empty or longer than 35 characters the placeholder "Null" is used.
 *
 * @param {Function} $ - cheerio instance loaded with the article page.
 * @param {string} news_title - Article title, used as the file name prefix.
 */
function savedImg($, news_title) {
	$('.article-content img').each(function(index, item) {
		const src = $(this).attr('src');
		// An <img> without src would otherwise request ".../undefined".
		if (!src) {
			return;
		}

		let img_title = $(this).parent().next().text().trim();
		if (img_title.length > 35 || img_title === "") {
			img_title = "Null";
		}
		const img_filename = img_title + '.jpg';
		const img_src = 'http://www.ss.pku.edu.cn' + src;
		// NOTE(review): several uncaptioned images in one article all map to
		// the same "...---Null.jpg" path and overwrite each other; consider
		// adding `index` to the name if every image must be kept.
		const target = './images/' + news_title + '---' + img_filename;

		// Listen for 'error' on both streams: an unhandled 'error' event on
		// either one would crash the whole crawl. This replaces the separate
		// HEAD request, whose only effect was to log a connection error.
		request(img_src)
			.on('error', function(err) {
				console.log(err);
			})
			.pipe(fs.createWriteStream(target))
			.on('error', function(err) {
				console.log(err);
			});
	});
}

你可能感兴趣的:(nodejs)