The most basic crawler project.

import requests
import re
import json
import os

Getting the binary stream

def get_image(url):
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }

    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.content
    return None

Getting the page

def get_page(page):
    url = 'https://maoyan.com/board/4?offset=%d' % (page * 10)
    print(url)

    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }

    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None

Parsing the page

def parse_page(html):
    result = []

    # NOTE: the HTML tags inside the regular expressions were swallowed when the
    # original post was rendered; the patterns below are reconstructed from the
    # markup of the Maoyan board page and may need adjusting if the page changes.

    # Movie title
    pattern = re.compile('movieId:.*?>.*?<p class="name"><a.*?>(.*?)</a>', re.S)
    names = re.findall(pattern, html)
    # print(names)

    # Starring actors
    pattern = re.compile('<p class="star">(.*?)</p>', re.S)
    actors = re.findall(pattern, html)
    actors = [actor.strip() for actor in actors]
    # print(actors)

    # Release time
    pattern = re.compile('<p class="releasetime">(.*?)</p>', re.S)
    releasetimes = re.findall(pattern, html)
    releasetimes = [releasetime.strip() for releasetime in releasetimes]
    # print(releasetimes)

    # Score: the integer and fractional parts sit in two separate tags,
    # so each match is a 2-tuple that is joined back into one string
    pattern = re.compile('<i class="integer">(.*?)</i><i class="fraction">(.*?)</i>', re.S)
    scores = re.findall(pattern, html)
    scores = [''.join(score) for score in scores]
    # print(scores)

    # Rank
    pattern = re.compile('<i class="board-index.*?">(.*?)</i>', re.S)
    ranks = re.findall(pattern, html)
    # print(ranks)

    # Image URL
    pattern = re.compile('movieId:.*?>.*?<img data-src="(.*?)"', re.S)
    image_urls = re.findall(pattern, html)
    # print(image_urls)

    for rank, name, actor, releasetime, score, image_url in zip(
            ranks, names, actors, releasetimes, scores, image_urls):
        result.append({
            'rank': rank,
            'name': name,
            'actors': actor,
            'releasetime': releasetime,
            'score': score,
            'image': image_url,
        })
        save_image(image_url)   # download the poster while we are at it

    return result

Saving the data

def save_json(result):
    result_str = json.dumps(result, ensure_ascii=False)
    with open('maoyan3.json', 'w', encoding='utf-8') as f:
        f.write(result_str)

Saving the images

def save_image(url):
    image_content = get_image(url)
    if image_content is None:
        return
    os.makedirs('./images', exist_ok=True)   # make sure the target directory exists
    filename = url.split('/')[-1].split('@')[0]
    filepath = './images/%s' % filename
    with open(filepath, 'wb') as f:
        f.write(image_content)

def main():
    all_result = []
    for page in range(10):
        print('page: %d' % (page + 1))
        html = get_page(page)
        # print(html)
        if html is None:
            continue
        one_page_result = parse_page(html)
        all_result.extend(one_page_result)

    print(all_result)
    save_json(all_result)

if __name__ == '__main__':
    main()

The data we end up with is JSON; it can be converted into a more readable form with this site: http://www.bejson.com/jsoneditoronline/.
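If you would rather inspect the result without leaving Python, a minimal sketch along these lines also works (assuming the script above has already written maoyan3.json to the current directory):

import json

with open('maoyan3.json', 'r', encoding='utf-8') as f:
    movies = json.load(f)      # parse the JSON text back into a list of dicts

for movie in movies[:3]:       # peek at the first few entries
    print(movie)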
