http-spider(爬虫)

网络爬虫

1.概念:

通过后端语言(如 Node.js)爬取网站中的数据，然后通过特定模块(如 cheerio)进行数据清洗，最后将数据输出给前端。


2.案例

const http=require('http');//导入http模块

const cheerio = require( 'cheerio' )

// options 是从 Node.js 官网复制的
// Request options passed to http.get(). The header set was copied from the
// browser's "Request Headers" panel (Network tab, XHR/Doc entry) of the page
// being scraped and then reformatted as an object literal.
const options = {
  hostname: '  域名  ', // placeholder: host name of the target site
  port: 80, // 80 for http, 443 for https
  path: '路径', // placeholder: request path
  method: 'GET', // request method
  headers: {
    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    // The response is consumed as plain UTF-8 text and never decompressed, so
    // ask the server for an uncompressed body instead of 'gzip, deflate'.
    'Accept-Encoding': 'identity',
    'Accept-Language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
    'Cache-Control': 'no-cache',
    // NOTE(review): hard-coded session cookie. It is a credential and will
    // expire; consider supplying it from configuration instead of source.
    Cookie: 'PHPSESSID=ST-22290-Uo8KnobsTgDO-TrQvhjA4TfoJI4-izm5ejd5j1npj2pjc7i3v4z',
    // No explicit Host header: Node derives it from `hostname`/`port`, which
    // avoids sending a blank or mismatched value.
    Pragma: 'no-cache',
    'Proxy-Connection': 'keep-alive',
    Referer: 'http://jx.1000phone.net/teacher.php/Class/index',
    'Upgrade-Insecure-Requests': 1,
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Content-Length': 0
  }
};

// Fetch the page described by `options` and print the text of every
// `td.student a` element found in the returned HTML.
const req = http.get(options, (res) => {
  const { statusCode } = res; // HTTP status code (1xx - 5xx)
  const contentEncoding = res.headers['content-encoding'];

  // Only a successful response carries the page we want to parse; anything
  // else (redirect, login page, error page) would be scraped as garbage.
  if (statusCode < 200 || statusCode >= 300) {
    console.error(`Request Failed.\nStatus Code: ${statusCode}`);
    res.resume(); // consume the response data so its memory is freed
    return;
  }

  // The body is decoded as UTF-8 text below and never decompressed, so a
  // compressed payload cannot be parsed; report it instead of failing silently.
  if (contentEncoding && contentEncoding !== 'identity') {
    console.error(`Unsupported Content-Encoding: ${contentEncoding}`);
    res.resume();
    return;
  }

  res.setEncoding('utf8'); // decode chunks as UTF-8 strings

  // Core -- start
  let rawData = '';
  res.on('data', (chunk) => { rawData += chunk; }); // accumulate the body
  res.on('end', () => { // the whole body has been received
    try {
      const $ = cheerio.load(rawData);

      // Inside .each(), `this` is the current matched element.
      $('td.student a').each(function () {
        console.log($(this).text());
      });
    } catch (e) {
      console.error(e.message);
    }
  });
  // Core -- end
});

// Connection-level failures (DNS, refused connection, reset, ...).
req.on('error', (e) => {
  console.error(`Got error: ${e.message}`);
});

// http.get() ends the request itself, so no explicit req.end() is needed.

你可能感兴趣的:(http-spider(爬虫))