学习 Python 以来的第一个爬虫,参考《Python3 网络爬虫开发实战》
爬取目标为:猫眼电影Top100榜
完全是依葫芦画瓢:首先请求网页内容,然后再通过正则表达式提取自己感兴趣的数据,最后转成 JSON 格式存入文本文件。
import json
import time
import requests
import re
def get_one_page(url, timeout=10):
    """Download a page and return its body text, or None on failure.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with HTTP 200;
        None for any other status code or when the request itself fails
        (connection error, timeout, ...).
    """
    # A desktop-browser User-Agent is sent with every request.
    # NOTE(review): presumably the site rejects the default client
    # string -- confirm before removing.
    headers = {
        'User-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }
    try:
        # Without an explicit timeout the call can block indefinitely
        # on a stalled connection.
        resp = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Report transport failures the same way as a bad status code.
        return None
    if resp.status_code == 200:
        return resp.text
    return None
# One <dd>...</dd> movie entry. Capture groups, in order: rank, poster URL,
# title, cast line, release line. Every group is closed by an explicit tag;
# a lazy group followed directly by another lazy wildcard would always
# capture the empty string.
# NOTE(review): the tag delimiters assume the Maoyan board markup this
# scraper targets -- verify against a live page.
_MOVIE_ENTRY = re.compile(
    r'<dd>.*?board-index.*?>(.*?)</i>.*?data-src="(.*?)".*?'
    r'name.*?<a.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>.*?</dd>',
    re.S)

# Number of leading characters dropped from the cast and release lines.
# NOTE(review): presumably the labels "主演:" and "上映时间:" -- confirm.
_ACTOR_LABEL_LEN = 3
_TIME_LABEL_LEN = 5


def parse_page(html):
    """Yield one dict per movie entry found in a board page.

    Args:
        html: Page markup as a string. None or an empty string yields
            nothing, so the result of a failed download can be passed in
            directly.

    Yields:
        Dicts with the string keys 'index', 'img', 'title', 'actor' and
        'time'. 'title' is whitespace-stripped; 'actor' and 'time' are
        whitespace-stripped and then have their leading label removed.
    """
    if not html:
        return
    for index, img, title, actor, release in _MOVIE_ENTRY.findall(html):
        yield {
            'index': index,
            'img': img,
            'title': title.strip(),
            # Slicing past the end of a short string already gives '',
            # so no separate length check is needed.
            'actor': actor.strip()[_ACTOR_LABEL_LEN:],
            'time': release.strip()[_TIME_LABEL_LEN:],
        }
def main(offset):
    """Scrape one board page and append every movie found to the result file.

    Args:
        offset: Value placed in the ``offset`` query parameter of the
            board URL to select which page is fetched.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page signals a failed download with None, which the
        # regex-based parser cannot accept; skip this page instead of
        # crashing the whole run.
        print('Skipping offset {}: page could not be fetched'.format(offset))
        return
    for itm in parse_page(html):
        write_to_file(itm)
def write_to_file(content):
    """Append ``content`` to result.txt as a single JSON line.

    Args:
        content: JSON-serialisable object. Non-ASCII characters are
            written as-is (UTF-8) rather than as ``\\u`` escapes.
    """
    with open("result.txt", 'a', encoding='utf-8') as out:
        serialized = json.dumps(content, ensure_ascii=False)
        out.write(serialized + '\n')
if __name__ == '__main__':
    # Visit offsets 0, 10, ..., 90 (ten pages), sleeping one second after
    # each page so requests are spaced out.
    for page_offset in range(0, 100, 10):
        main(page_offset)
        time.sleep(1)