# 爬取豆瓣TOP250排行榜并用csv文件存储 (crawl the Douban Top 250 chart and save it to a CSV file)

import requests
from bs4 import BeautifulSoup
import csv

def getHtml(url):
    """Fetch *url* and return the response body as text.

    Returns the sentinel string "error connection" on any request
    failure (preserves the original error contract).
    """
    headers = {
        # Douban rejects the default requests User-Agent, so pretend
        # to be a desktop Chrome browser.
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
        )
    }
    try:
        # Bug fix: the original overwrote the url parameter with the
        # page-1 URL here, so every page requested was page 1 and the
        # pagination loop in circle() silently fetched duplicates.
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        # Guess the real encoding from the body instead of the header.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare except: only swallow network/HTTP errors.
        return "error connection"

def PaHtml(url):
    """Parse one Top-250 page and return a list of movie records.

    Args:
        url: despite the name, this is the page's HTML text
            (kept as-is for caller compatibility).

    Returns:
        A list of [rank, name, info, score, vote_count, link] lists,
        one per movie on the page.
    """
    page_list = []
    bs = BeautifulSoup(url, "html.parser")
    # Bug fix: find_all() matches tag *names*, not CSS selectors, so
    # find_all(".grid_view li") etc. returned nothing. Use the CSS
    # selector API (select / select_one) throughout.
    for item in bs.select(".grid_view li"):
        rank = item.select_one(".item em").text
        name = item.select_one(".hd a span").text
        info = item.select_one(".bd p").text
        # Strip layout whitespace, newlines, and non-breaking spaces.
        info = info.replace(" ", "").replace("\n", "").replace("\xa0", "")
        score = item.select_one(".bd .star .rating_num").text
        # The last span in the star block holds the vote count.
        votes = item.select(".bd .star span")[-1].text
        link = item.select_one(".hd a")["href"]
        page_list.append([rank, name, info, score, votes, link])
    return page_list

def circle():
    """Fetch all ten Top-250 pages and merge their movie records.

    Returns:
        A single list of all 250 movie records (see PaHtml for the
        record layout).
    """
    total_list = []
    # Douban paginates with ?start=0,25,...,225 (25 movies per page).
    for start in range(0, 250, 25):
        html = getHtml(
            "https://movie.douban.com/top250?start=" + str(start) + "&filter="
        )
        total_list += PaHtml(html)
    return total_list

def BaoCun(total_list, file_path=r"D:\Python\PaChong\DouBan250.csv"):
    """Write the movie records to a CSV file.

    Args:
        total_list: list of [rank, name, info, score, votes, url] rows
            as produced by PaHtml/circle.
        file_path: destination CSV path. Defaults to the original
            hard-coded location, so existing callers are unchanged.
    """
    # Raw-string default above avoids accidental backslash escapes in
    # the Windows path (the original concatenated unescaped "\D...").
    # newline="" is required by the csv module to avoid blank rows on
    # Windows; utf-8 keeps the Chinese header readable.
    with open(file_path, "w", encoding="utf-8", newline="") as f:
        csv_writer = csv.writer(f)
        csv_writer.writerow(["排名", "名字", "评分", "评分人数", "信息", "url"])
        for i in total_list:
            # Bug fix: the original wrote i[6] and i[7], which do not
            # exist in a 6-element record (IndexError on the first row).
            # Reorder fields to match the header: rank, name, score,
            # votes, info, url.
            csv_writer.writerow([i[0], i[1], i[3], i[4], i[2], i[5]])
    print("文件存储完毕")

# Bug fix: the original tested `name == "main"`, which raises NameError
# (and would be False anyway); the correct guard uses the dunder names.
if __name__ == "__main__":
    total_list = circle()
    BaoCun(total_list)

# 你可能感兴趣的:(爬虫)  -- blog-site footer residue, kept as a comment so the file parses