# 1) 爬取豆瓣电影 (scrape Douban movies)
import requests
import json
import pandas as pd
from bs4 import BeautifulSoup
import csv
import os
import time
# Browser-like User-Agent so Douban does not reject the requests as a bot.
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36"
}
def getMovieUrl(start):
    """Collect detail-page URLs for up to 100 movies from Douban's
    recommendation API (5 JSON pages of 20 entries each).

    Args:
        start: zero-based offset of the first movie to request.

    Returns:
        list of detail-page URL strings; empty when the API has no more data.
    """
    movie_url_list = []
    # One request per page of 20 results.
    # FIX: the original reused `i` for both the page offset and the
    # per-movie loop, shadowing the outer variable inside the body.
    for page_start in range(start, start + 100, 20):
        url = "https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%8D%8E%E8%AF%AD&sort=recommend&page_limit=20&page_start=" + \
            str(page_start)
        response = requests.get(url, headers=headers)
        data2 = json.loads(response.text)
        for subject in data2["subjects"]:
            movie_url_list.append(subject['url'])
    return movie_url_list
def _select_text(soup, selector):
    """Return the whitespace-stripped text of the first node matching *selector*."""
    node = soup.select(selector)[0]
    return node.get_text().replace("\n", "").replace(" ", "")

def catchMovieImformation(urls):
    """Scrape name, director, top-3 actors, year, score and vote count
    from each Douban movie detail page.

    Args:
        urls: iterable of movie detail-page URLs.

    Returns:
        Tuple of six parallel lists:
        (names, directors, actor-dicts, years, scores, vote-counts).
    """
    movie_name_list = []
    director_list = []
    actor_list = []
    year_list = []
    score_list = []
    evaluation_list = []
    for movie_url in urls:
        response = requests.get(movie_url, headers=headers)
        soup = BeautifulSoup(response.text, "html.parser")
        movie_name_list.append(
            _select_text(soup, "#content > h1 > span:nth-child(1)"))
        director_list.append(
            _select_text(soup, "#info > span:nth-child(1) > span.attrs > a"))
        actor = _select_text(soup, "#info > span.actor > span.attrs")
        actors = actor.split("/")
        # Keep at most the first three actors, keyed 主演1..主演3
        # (same keys and insertion order as the original if/elif chain).
        actor_list.append(
            {"主演%d" % idx: name for idx, name in enumerate(actors[:3], 1)})
        year = _select_text(soup, "#content > h1 > span.year")
        year_list.append(year.replace("(", "").replace(")", ""))
        score_list.append(_select_text(
            soup,
            "#interest_sectl > div.rating_wrap.clearbox > "
            "div.rating_self.clearfix > strong"))
        evaluation_list.append(_select_text(
            soup,
            "#interest_sectl > div.rating_wrap.clearbox > "
            "div.rating_self.clearfix > div > div.rating_sum > a > span"))
    # BUG FIX: the original returned the last scalar `evaluation`
    # instead of the accumulated `evaluation_list`.
    return (movie_name_list, director_list, actor_list,
            year_list, score_list, evaluation_list)
def writer(urls, file_path='./movices.csv'):
    """Scrape the movies behind *urls* and append them to a CSV file.

    Args:
        urls: iterable of Douban movie detail-page URLs.
        file_path: destination CSV path (appended to on every call).
    """
    res = catchMovieImformation(urls)
    data = {'电影名称': res[0],
            '导演': res[1],
            '主演(前三位)': res[2],
            '上映时间': res[3],
            '评分数': res[4],
            '评价人数': res[5]
            }
    movices = pd.DataFrame(data)
    # BUG FIX: store the actor names as a plain list; the original kept a
    # dict_values view, which serializes as "dict_values([...])" in the CSV.
    movices.iloc[:, 2] = movices.iloc[:, 2].map(lambda x: list(x.values()))
    # BUG FIX: write the header only on the very first append, so the CSV
    # has exactly one header row and pd.read_csv parses it correctly.
    write_header = not os.path.exists(file_path)
    movices.to_csv(file_path, mode='a', header=write_header)
# Crawl batches of 100 movies and append each batch to the CSV.
# BUG FIX: the original called writer(urls) once before the loop AND again
# on the loop's first iteration, writing the first 100 movies twice.
start = 0
urls = getMovieUrl(start)
while len(urls):
    writer(urls)
    start += 100
    time.sleep(3)  # throttle requests to avoid being rate-limited
    urls = getMovieUrl(start)
    print(start)
movices = pd.read_csv('./movices.csv')
movices  # notebook-style display of the collected data
# 2) 爬取笔趣网小说《赝太子》 (scrape the novel "赝太子" from biquge)
'''
@Description: 爬取笔趣阁《赝太子》小说
@version: 1.0.0
@Author: blsm
@Date: 2020-05-28 09:42:46
'''
from bs4 import BeautifulSoup
import requests
import sys
class Downloader():
    """Download the novel chapter by chapter from biqugexx.com."""

    def __init__(self):
        self.server = 'https://www.biqugexx.com'
        self.target = 'https://www.biqugexx.com/118_118150/'
        self.namelist = []  # chapter titles
        self.urls = []      # absolute chapter URLs
        self.nums = 0       # number of chapters

    def get_download_url(self):
        """Fetch the index page and populate namelist/urls/nums."""
        req = requests.get(self.target)
        html = req.text
        # FIX: pin the parser explicitly; bare BeautifulSoup(html) emits
        # GuessedAtParserWarning and may pick different parsers per machine.
        bf = BeautifulSoup(html, 'html.parser')
        list_bf = bf.find_all('div', id='list')
        a_bf = BeautifulSoup(str(list_bf[0]), 'html.parser')
        a = a_bf.find_all('a')
        # The first 12 links are "latest chapters" duplicates; skip them.
        self.nums = len(a[12:])
        for each in a[12:]:
            self.namelist.append(each.string)
            self.urls.append(self.server + each.get('href'))

    def get_contents(self, target):
        """Download one chapter and return its cleaned text.

        Arguments:
            target -- chapter page URL
        """
        req = requests.get(target)
        bf = BeautifulSoup(req.text, 'html.parser')
        content = bf.find_all('div', id='content')
        # Four consecutive &nbsp; characters mark a paragraph break on this site.
        return content[0].text.replace('\xa0' * 4, '\n\n')

    def writer(self, name, path, content):
        """Append one chapter (title followed by body) to the output file.

        Arguments:
            name -- chapter title
            path -- output file path
            content -- chapter text
        """
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n')
            f.writelines(content)
            f.write('\n\n')
if __name__ == '__main__':
    # Build the chapter list, then download and append each chapter in order.
    downloader = Downloader()
    downloader.get_download_url()
    path = './赝太子(荆柯守).txt'
    print(u"《赝太子》开始下载")
    for idx in range(downloader.nums):
        chapter_text = downloader.get_contents(downloader.urls[idx])
        downloader.writer(downloader.namelist[idx], path, chapter_text)
        # Report progress on every 10th chapter, overwriting the same line.
        if idx % 10 == 0:
            sys.stdout.write(u"已下载:%.3f%%" % float(idx / downloader.nums * 100) + '\r')
            sys.stdout.flush()
    print(u"《赝太子》下载完成")