I'll crawl it myself: scraping the Douban Movie Top 250 three ways, with Beautiful Soup, XPath, and regular expressions
I was going to hand in the Douban scraper I'd hacked together with Beautiful Soup as my Python course project, only to find the assignment requires regular expressions orz...
Hence this post: scraping the Douban Movie Top 250 three different ways.
Target URL: https://movie.douban.com/top250
A quick look at the page structure shows the 250 entries are spread over 10 pages, 25 per page, so the URLs follow a simple pattern:
for offset in range(0, 250, 25):
    url = 'https://movie.douban.com/top250?start=' + \
        str(offset) + '&filter='
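As an aside, requests can assemble the query string itself via its params argument, so the manual concatenation isn't strictly necessary. A minimal sketch:
import requests

# requests turns the dict into "?start=25&filter=" for us
# (for a real fetch, also include the User-Agent header shown below)
r = requests.get('https://movie.douban.com/top250',
                 params={'start': 25, 'filter': ''})
print(r.url)  # https://movie.douban.com/top250?start=25&filter=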
Next, the usual routine: fetch the page and take a look at the HTML. (Douban blocks the default client, so include a browser User-Agent in the request headers.)
def get_url(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text
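Two optional hardening tweaks, sketched here rather than part of the assignment: pass a timeout so a stalled connection can't hang the crawl, and sleep briefly between pages to stay polite to the server:
import time
import requests

def get_url(url, timeout=10, delay=1.0):
    headers = {'user-agent': 'Mozilla/5.0 ...'}  # reuse the full UA string from above
    r = requests.get(url, headers=headers, timeout=timeout)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    time.sleep(delay)  # small pause between requests
    return r.text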
Now parse the page with each of the three approaches:
BeautifulSoup:
def html_parser(url):
    html = get_url(url)
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all(name='li')
    content = []
    for item in items[18:]:  # skip the leading <li> tags from the page header/nav
        index = item.find(name='em').get_text()
        title = item.find('span', class_='title').get_text()
        rating = item.find('span', class_='rating_num').get_text()
        link = item.find(name='a')['href']
        content.append((index, title, rating, link))
    return content
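The [18:] slice works because the movie entries happen to start at the 19th <li> on the page, but counting list items is fragile. Each entry also sits in its own <div class="item"> wrapper, so a more targeted version (a sketch under that markup assumption) needs no slicing:
def html_parser(url):
    html = get_url(url)
    soup = BeautifulSoup(html, 'html.parser')
    content = []
    for item in soup.find_all('div', class_='item'):  # one div per movie
        index = item.find('em').get_text()
        title = item.find('span', class_='title').get_text()
        rating = item.find('span', class_='rating_num').get_text()
        link = item.find('a')['href']
        content.append((index, title, rating, link))
    return content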
XPath:
def html_parser(url):
    html = get_url(url)
    tree = etree.HTML(html)
    rank = tree.xpath('//em[@class]/text()')  # rank
    name = tree.xpath('//div[@class="hd"]//a[@href]//span[1]/text()')  # title
    rating_num = tree.xpath('//span[@class="rating_num"]/text()')  # rating
    link = tree.xpath('//div[@class="hd"]//a[@href]/@href')  # link
    return zip(rank, name, rating_num, link)
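One caveat with the zip: it only lines up because all four queries return exactly 25 results in page order; if one movie were missing a field, everything after it would shift by one. A defensive variant (a sketch, assuming the <div class="item"> wrapper around each entry) queries per item with relative paths:
def html_parser(url):
    html = get_url(url)
    tree = etree.HTML(html)
    # Extract fields relative to each movie node so a missing field
    # cannot shift the remaining columns out of alignment
    for item in tree.xpath('//div[@class="item"]'):
        rank = item.xpath('.//em/text()')[0]
        name = item.xpath('.//span[@class="title"]/text()')[0]  # first title span = Chinese title
        rating = item.xpath('.//span[@class="rating_num"]/text()')[0]
        link = item.xpath('.//div[@class="hd"]/a/@href')[0]
        yield (rank, name, rating, link)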
Regex (how did I manage to write this so convoluted?):
def html_parser(url):
    html = get_url(url)
    patt_index = re.compile('<em class="">(.*?)</em>')  # rank
    patt_name = re.compile('<span class="title">(.*?)</span>')  # movie title
    patt_rating = re.compile(
        '<span class="rating_num" property="v:average">(.*?)</span>')  # rating
    patt_link = re.compile('<a href="(.*?)" class="">')  # detail-page link
    rank = re.findall(patt_index, html)
    name = re.findall(patt_name, html)
    rating_num = re.findall(patt_rating, html)
    link = re.findall(patt_link, html)
    # The title pattern also catches the "&nbsp;/&nbsp;English title" spans;
    # drop any match containing an HTML entity
    pro_name = [i for i in name if '&' not in i]
    return zip(rank, pro_name, rating_num, link)
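For what it's worth, the four patterns can be collapsed into one that captures all four fields per entry, which sidesteps both the zip alignment issue and the '&' filtering (a sketch against the same markup assumptions as the patterns above):
def html_parser(url):
    html = get_url(url)
    # One match per movie: rank, detail link, first (Chinese) title, rating.
    # re.S lets .*? cross the newlines between fields; [^"] and [^<] keep
    # each capture from running past its own attribute or tag.
    patt = re.compile(
        r'<em class="">(\d+)</em>.*?'
        r'<a href="([^"]*)" class="">.*?'
        r'<span class="title">([^<]*)</span>.*?'
        r'<span class="rating_num" property="v:average">([^<]*)</span>', re.S)
    for rank, link, name, rating in patt.findall(html):
        yield (rank, name, rating, link)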
Writing the results to a file (filename is the per-version output path, e.g. 'movies_bs4.txt' in the full code below; chr(12288) is the full-width space, used as padding so the Chinese titles line up in columns):
def write_movies_file(items, tplt):
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(tplt.format(items[0], items[1],
                            items[2], items[3], chr(12288)))
        f.write('\n')
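If the output ever needs to be consumed by other tools, the csv module is a drop-in alternative to the hand-formatted text file (a minimal sketch; 'movies.csv' is an arbitrary name):
import csv

def write_movies_file(items):
    # one row per movie; newline='' prevents blank lines on Windows
    with open('movies.csv', 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerow(items)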
Full code:
BeautifulSoup:
import requests
from bs4 import BeautifulSoup


def get_url(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text


def html_parser(url):
    html = get_url(url)
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.find_all(name='li')
    content = []
    for item in items[18:]:  # skip the leading <li> tags from the page header/nav
        index = item.find(name='em').get_text()
        title = item.find('span', class_='title').get_text()
        rating = item.find('span', class_='rating_num').get_text()
        link = item.find(name='a')['href']
        content.append((index, title, rating, link))
    return content


def write_movies_file(items, tplt):
    with open('movies_bs4.txt', 'a', encoding='utf-8') as f:
        f.write(tplt.format(items[0], items[1],
                            items[2], items[3], chr(12288)))
        f.write('\n')


def main():
    # chr(12288) is the full-width space, used via {4} as the fill character
    # so the Chinese titles align in fixed-width columns
    tplt = "{0:^10}\t{1:{4}^30}\t{2:^30}\t{3:30}"
    for offset in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start=' + \
            str(offset) + '&filter='
        for item in html_parser(url):
            write_movies_file(item, tplt)
            print(tplt.format(item[0], item[1], item[2], item[3], chr(12288)))


if __name__ == '__main__':
    main()
XPath:
import requests
from lxml import etree


def get_url(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text


def html_parser(url):
    html = get_url(url)
    tree = etree.HTML(html)
    rank = tree.xpath('//em[@class]/text()')  # rank
    name = tree.xpath('//div[@class="hd"]//a[@href]//span[1]/text()')  # title
    rating_num = tree.xpath('//span[@class="rating_num"]/text()')  # rating
    link = tree.xpath('//div[@class="hd"]//a[@href]/@href')  # link
    return zip(rank, name, rating_num, link)


def write_movies_file(items, tplt):
    with open('movies_xpath.txt', 'a', encoding='utf-8') as f:
        f.write(tplt.format(items[0], items[1],
                            items[2], items[3], chr(12288)))
        f.write('\n')


def main():
    # chr(12288) is the full-width space used via {4} as the fill character
    tplt = "{0:^10}\t{1:{4}^30}\t{2:^30}\t{3:30}"
    for offset in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start=' + \
            str(offset) + '&filter='
        for item in html_parser(url):
            write_movies_file(item, tplt)
            print(tplt.format(item[0], item[1], item[2], item[3], chr(12288)))


if __name__ == '__main__':
    main()
Regex:
import requests
import re


def get_url(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    return r.text


def html_parser(url):
    html = get_url(url)
    patt_index = re.compile('<em class="">(.*?)</em>')  # rank
    patt_name = re.compile('<span class="title">(.*?)</span>')  # movie title
    patt_rating = re.compile(
        '<span class="rating_num" property="v:average">(.*?)</span>')  # rating
    patt_link = re.compile('<a href="(.*?)" class="">')  # detail-page link
    rank = re.findall(patt_index, html)
    name = re.findall(patt_name, html)
    rating_num = re.findall(patt_rating, html)
    link = re.findall(patt_link, html)
    # The title pattern also catches the "&nbsp;/&nbsp;English title" spans;
    # drop any match containing an HTML entity
    pro_name = [i for i in name if '&' not in i]
    return zip(rank, pro_name, rating_num, link)


def write_movies_file(items, tplt):
    with open('movies_re.txt', 'a', encoding='utf-8') as f:
        f.write(tplt.format(items[0], items[1],
                            items[2], items[3], chr(12288)))
        f.write('\n')


def main():
    # chr(12288) is the full-width space used via {4} as the fill character
    tplt = "{0:^10}\t{1:{4}^30}\t{2:^30}\t{3:30}"
    for offset in range(0, 250, 25):
        url = 'https://movie.douban.com/top250?start=' + \
            str(offset) + '&filter='
        for item in html_parser(url):
            write_movies_file(item, tplt)
            print(tplt.format(item[0], item[1], item[2], item[3], chr(12288)))


if __name__ == '__main__':
    main()