Python Crawling, Data Cleaning and Visualization - 2 - requests - Exercise 1: Scraping Baidu's Hot Search List

Exercise: scrape the rank, title, and link of Baidu's real-time hot search top 10.

# -*- coding: UTF-8 -*-
import requests
from lxml import etree
import csv
# 1. Fetch the page
urls = "http://top.baidu.com/buzz?b=1"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"}
res = requests.get(url=urls, headers=headers)
res.encoding = 'gb2312'
html = etree.HTML(res.text)
# response = res.content.decode('gb2312')  # decoding the raw bytes yourself is the recommended approach
# html = etree.HTML(response)
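# Why: res.text decodes with res.encoding, which requests guesses from the HTTP
# headers (it typically falls back to ISO-8859-1 when the header has no charset),
# so it must be set before reading res.text; res.apparent_encoding can also be
# used to detect the charset from the content itself.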
cons = html.xpath('//table[@class="list-table"]//tr')
# 2. Extract the data
paiming = []
guanjianci = []
lianjie = []
sousuozhishu = []
for item in cons:
	pm = item.xpath('td[@class="first"]/span/text()')
	if pm:  # skip rows where the XPath found nothing (empty list)
		paiming.append(pm[0])  # pm[0] takes the first element; appending pm itself would write the whole list, brackets and all
	gjc = item.xpath('td[@class="keyword"]/a[1]/text()')
	if gjc:  # skip empty matches
		guanjianci.append(gjc[0])
	lj = item.xpath('td[@class="keyword"]/a/@href')
	if lj:  # skip empty matches
		lianjie.append(lj[0])
	sszs = item.xpath('td[@class="last"]/span/text()')
	if sszs:  # skip empty matches
		sousuozhishu.append(sszs[0])
print(guanjianci)
# 3. Save the results to a CSV file
with open('hot.csv', 'w', newline='') as h:  # open (or create) 'hot.csv' in the current directory; newline='' avoids blank lines on Windows
	h_csv = csv.writer(h)  # csv.writer(h) returns a writer object
	h_csv.writerow(['rank', 'keyword', 'link', 'search index'])  # writerow() writes a single row; writerows() writes many rows at once
	h_csv.writerows(zip(paiming, guanjianci, lianjie, sousuozhishu))  # zip pairs the four lists row by row and stops at the shortest, avoiding an IndexError
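
As the comments above note, xpath() always returns a list, which is why the code takes pm[0] and guards against empty results. A minimal self-contained sketch (the inline HTML string is made up for illustration):

from lxml import etree

demo = etree.HTML('<table><tr><td class="first"><span>1</span></td></tr></table>')
spans = demo.xpath('//td[@class="first"]/span/text()')
print(spans)     # ['1'] -- always a list, even for a single match
print(spans[0])  # '1'   -- index [0] to get the bare string
missing = demo.xpath('//td[@class="last"]/span/text()')
print(missing)   # []    -- no match yields an empty list, hence the `if` guards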

Summary:
The crawler learning path:
1. Fetching: requests, urllib.request, GET/POST requests, etc.
2. Data extraction: XPath, BeautifulSoup, CSS selectors
3. Crawler frameworks: Scrapy, Selenium
4. Storage: CSV, MongoDB, MySQL
5. Data cleaning: pandas
6. Visualization: matplotlib (a sketch of steps 5 and 6 follows this list)
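
A minimal sketch of steps 5 and 6, assuming the hot.csv written above (the column names rank/keyword/link/search index come from this example, not from any fixed schema):

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('hot.csv')
# cleaning: coerce the search index to numbers, dropping rows that fail to parse
df['search index'] = pd.to_numeric(df['search index'], errors='coerce')
df = df.dropna(subset=['search index'])
# plt.rcParams['font.sans-serif'] = ['SimHei']  # uncomment if CJK keywords render as boxes (assumes the font is installed)
df.plot.bar(x='keyword', y='search index', legend=False)
plt.tight_layout()
plt.show()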

After learning Scrapy, come back and study the class-based version below more carefully.

import requests
from lxml import etree
import time
import json


class Item:
    id = None            # sequence number
    rank_list = None     # rank
    content_list = None  # title
    href_list = None     # link


class GetBaiduHotSearch:
    def get_html(self, url):
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
            }
            response = requests.get(url=url, headers=headers)
            if response.status_code == 200:
                response.encoding='gb2312' 
                return response.text
            return None
        except Exception:
            return None

    def get_content(self, html):
        items = []
        # normalize-space strips spaces and newline characters
        content = etree.HTML(html)
        all_list = content.xpath('//table[@class="list-table"]//tr')[1:14]  # skip the table's header row
        # running id counter
        id = 0
        for i in all_list:
            item = Item()
            id += 1  # increment for each row
            item.id = id
            item.rank_list = i.xpath('td[@class="first"]//text()')
            item.content_list = i.xpath('td[@class="keyword"]//a/text()')
            item.href_list = i.xpath('td[@class="keyword"]//a/@href')
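            # Note: normalize-space() can strip whitespace directly in XPath,
            # e.g. (a sketch): i.xpath('normalize-space(td[@class="first"])')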
            items.append(item)
        return items

    def write_to_txt(self, items):
        content_dict = {
            'id': None,
            'rank_list': None,
            'content_list': None,
            'href_list': None,
        }
        # write the records to a file, one JSON object per line
        with open('result.txt', 'a', encoding='utf-8') as f:
            for item in items:
                content_dict['id'] = item.id
                content_dict['rank_list'] = item.rank_list
                content_dict['content_list'] = item.content_list
                content_dict['href_list'] = item.href_list
                print(content_dict)
                f.write(json.dumps(content_dict, ensure_ascii=False) + '\n')

    def main(self):
        url = 'http://top.baidu.com/buzz?b=1'
        html = self.get_html(url)
        items = self.get_content(html)
        self.write_to_txt(items)


if __name__ == '__main__':
    GetBaiduHotSearch().main()
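
Since write_to_txt stores one JSON object per line, result.txt can be read back line by line; a minimal sketch:

import json

with open('result.txt', encoding='utf-8') as f:
    for line in f:
        record = json.loads(line)
        print(record['id'], record['content_list'])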
