python3
依赖库如下:
pip install requests pyquery beautifulsoup4 lxml
此次爬取猫眼电影排行榜,提取排名、电影名、主演、上映时间、评分、封面图片等。
https://maoyan.com/board/4?offset=0
def get_one_page(url, timeout=10):
    """Fetch one page and return its HTML text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body as text when the status code is 200, otherwise
        ``None``. ``None`` is also returned when the request raises
        ``RequestException`` (which includes timeouts).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_one_page(html):
    """Extract the movie entries of one board page with a regular expression.

    Args:
        html: Page source as a string. ``None`` or an empty string yields
            nothing, so a failed download does not crash the parser.

    Yields:
        One dict per ``<dd>`` entry with the keys ``index``, ``image``,
        ``title``, ``actor``, ``time`` and ``score`` (all strings).
    """
    if not html:
        return
    # Each entry lives in one <dd>...</dd>; the closing tags anchor the lazy
    # groups so that they capture the element text instead of matching empty.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,  # entries span several lines, so '.' must match newlines too
    )
    for item in pattern.findall(html):
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            # Drop the 3-character label in front of the cast list.
            'actor': item[3].strip()[3:],
            # Drop the 5-character label in front of the release date.
            'time': item[4].strip()[5:],
            # The score is split into an integer part and a fraction part.
            'score': item[5] + item[6],
        }
def parse_one_page(html):
    """Extract the movie entries of one board page with BeautifulSoup.

    Args:
        html: Page source as a string. ``None`` or an empty string yields
            nothing, so a failed download does not crash the parser.

    Yields:
        One dict per entry with the keys ``index``, ``image``, ``title``,
        ``actor``, ``time`` and ``score`` (all strings).
    """
    if not html:
        return
    soup = BeautifulSoup(html, 'lxml')
    ranks = soup.select('.board-index')
    links = soup.select('.image-link')
    posters = soup.select('.board-img')
    stars = soup.select('.star')
    releases = soup.select('.releasetime')
    integers = soup.select('.integer')
    fractions = soup.select('.fraction')
    # zip stops at the shortest list, so a page with fewer than ten entries
    # no longer raises IndexError the way a fixed range(10) did.
    for rank, poster, link, star, release, integer, fraction in zip(
            ranks, posters, links, stars, releases, integers, fractions):
        yield {
            'index': rank.text,
            'image': poster.attrs['data-src'],
            'title': link.attrs['title'],
            # Drop the 3-character label in front of the cast list.
            'actor': star.text.strip()[3:],
            # Drop the 5-character label in front of the release date.
            'time': release.text.strip()[5:],
            # The score is split into an integer part and a fraction part.
            'score': integer.text + fraction.text,
        }
def parse_one_page(html=None):
    """Extract the movie entries of one board page with lxml XPath queries.

    Args:
        html: Page source as a string. When omitted (``None``) the page is
            read from the local file ``maoyan.html``, which is what the
            function did before it accepted an argument. An empty string
            yields nothing.

    Yields:
        One dict per entry with the keys ``index``, ``image``, ``title``,
        ``actor``, ``time`` and ``score`` (all strings).
    """
    if html is None:
        tree = etree.parse('maoyan.html', etree.HTMLParser(encoding='utf-8'))
    elif not html:
        return
    else:
        # etree.HTML parses an in-memory string, so the function can be fed
        # the downloaded page directly like the other parser variants.
        tree = etree.HTML(html)

    ranks = tree.xpath('//div[@class="main"]//dl//dd/i/text()')
    titles = tree.xpath('//div[@class="main"]//dl//dd/a/@title')
    posters = tree.xpath('//div[@class="main"]//dl//dd//a//img[2]/@data-src')
    stars = tree.xpath('//div[@class="main"]//dl//dd//p[@class="star"]/text()')
    releases = tree.xpath('//div[@class="main"]//dl//dd//p[@class="releasetime"]/text()')
    integers = tree.xpath('//div[@class="main"]//dl//dd//i[@class="integer"]//text()')
    fractions = tree.xpath('//div[@class="main"]//dl//dd//i[@class="fraction"]//text()')

    # zip stops at the shortest list, so a page with fewer than ten entries
    # no longer raises IndexError the way a fixed range(10) did.
    for rank, poster, title, star, release, integer, fraction in zip(
            ranks, posters, titles, stars, releases, integers, fractions):
        yield {
            'index': rank,
            'image': poster,
            'title': title,
            # Drop the 3-character label in front of the cast list.
            'actor': star.strip()[3:],
            # Drop the 5-character label in front of the release date.
            'time': release.strip()[5:],
            # The score is split into an integer part and a fraction part.
            'score': integer + fraction,
        }
def parse_one_page(html):
    """Extract the movie entries of one board page with pyquery.

    Args:
        html: Page source as a string. ``None`` or an empty string yields
            nothing, so a failed download does not crash the parser.

    Yields:
        One dict per entry with the keys ``index``, ``image``, ``title``,
        ``actor``, ``time`` and ``score`` (all strings).
    """
    if not html:
        return
    doc = pq(html)
    ranks = [node.text() for node in doc('.board-index').items()]
    posters = [node.attr('data-src') for node in doc('.board-img').items()]
    titles = [node.attr('title') for node in doc('.image-link').items()]
    stars = [node.text() for node in doc('.star').items()]
    releases = [node.text() for node in doc('.releasetime').items()]
    integers = [node.text() for node in doc('.integer').items()]
    fractions = [node.text() for node in doc('.fraction').items()]

    # zip stops at the shortest list, so a page with fewer than ten entries
    # no longer raises IndexError the way a fixed range(10) did.
    for rank, poster, title, star, release, integer, fraction in zip(
            ranks, posters, titles, stars, releases, integers, fractions):
        yield {
            'index': rank,
            'image': poster,
            'title': title,
            # Drop the 3-character label in front of the cast list.
            'actor': star.strip()[3:],
            # Drop the 5-character label in front of the release date.
            'time': release.strip()[5:],
            # The score is split into an integer part and a fraction part.
            'score': integer + fraction,
        }
在与 *.py 文件相同的目录下新建四个 TXT 文件,例如 result_bs4.txt、result_pyquery.txt、result_re.txt、result_xpath.txt。
def write_to_file(content, filename='result_pyquery.txt'):
    """Append one record to a text file as a single JSON line.

    Args:
        content: JSON-serialisable object to store (here: one movie dict).
        filename: Path of the output file. Defaults to the file the
            function always wrote to before this parameter existed.
    """
    # ensure_ascii=False keeps non-ASCII text readable instead of \uXXXX
    # escapes; the with-block closes the file, so no explicit close is needed.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')
def main(offset):
    """Download one board page and store every parsed entry.

    Args:
        offset: Value of the ``offset`` query parameter that selects the
            page to fetch.
    """
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None for a non-200 status or a request
        # error; skip this page instead of handing None to the parser.
        return
    for item in parse_one_page(html):
        write_to_file(item)
if __name__ == '__main__':
    # Walk the ten board pages; the offset grows by 10 per page (0, 10, ..., 90).
    for page_offset in range(0, 100, 10):
        main(offset=page_offset)
https://github.com/wardseptember/crawlProject/tree/master/%E7%88%AC%E5%8F%96%E7%8C%AB%E7%9C%BC%E7%94%B5%E5%BD%B1%E6%8E%92%E8%A1%8C
欢迎star,谢谢。
我的个人博客同步更新
{"index": "1", "image": "https://p1.meituan.net/movie/20803f59291c47e1e116c11963ce019e68711.jpg@160w_220h_1e_1c", "title": "霸王别姬", "actor": "张国荣,张丰毅,巩俐", "score": "9.6", "time": "1993-01-01"}
{"index": "2", "image": "https://p0.meituan.net/movie/283292171619cdfd5b240c8fd093f1eb255670.jpg@160w_220h_1e_1c", "title": "肖申克的救赎", "actor": "蒂姆·罗宾斯,摩根·弗里曼,鲍勃·冈顿", "score": "9.5", "time": "1994-10-14(美国)"}
{"index": "3", "image": "https://p0.meituan.net/movie/54617769d96807e4d81804284ffe2a27239007.jpg@160w_220h_1e_1c", "title": "罗马假日", "actor": "格利高里·派克,奥黛丽·赫本,埃迪·艾伯特", "score": "9.1", "time": "1953-09-02(美国)"}
{"index": "4", "image": "https://p0.meituan.net/movie/e55ec5d18ccc83ba7db68caae54f165f95924.jpg@160w_220h_1e_1c", "title": "这个杀手不太冷", "actor": "让·雷诺,加里·奥德曼,娜塔莉·波特曼", "score": "9.5", "time": "1994-09-14(法国)"}
{"index": "5", "image": "https://p1.meituan.net/movie/f5a924f362f050881f2b8f82e852747c118515.jpg@160w_220h_1e_1c", "title": "教父", "actor": "马龙·白兰度,阿尔·帕西诺,詹姆斯·肯恩", "score": "9.3", "time": "1972-03-24(美国)"}
{"index": "6", "image": "https://p1.meituan.net/movie/0699ac97c82cf01638aa5023562d6134351277.jpg@160w_220h_1e_1c", "title": "泰坦尼克号", "actor": "莱昂纳多·迪卡普里奥,凯特·温丝莱特,比利·赞恩", "score": "9.5", "time": "1998-04-03"}
{"index": "7", "image": "https://p0.meituan.net/movie/da64660f82b98cdc1b8a3804e69609e041108.jpg@160w_220h_1e_1c", "title": "唐伯虎点秋香", "actor": "周星驰,巩俐,郑佩佩", "score": "9.2", "time": "1993-07-01(中国香港)"}
{"index": "8", "image": "https://p0.meituan.net/movie/b076ce63e9860ecf1ee9839badee5228329384.jpg@160w_220h_1e_1c", "title": "千与千寻", "actor": "柊瑠美,入野自由,夏木真理", "score": "9.3", "time": "2001-07-20(日本)"}
{"index": "9", "image": "https://p0.meituan.net/movie/46c29a8b8d8424bdda7715e6fd779c66235684.jpg@160w_220h_1e_1c", "title": "魂断蓝桥", "actor": "费雯·丽,罗伯特·泰勒,露塞尔·沃特森", "score": "9.2", "time": "1940-05-17(美国)"}
{"index": "10", "image": "https://p0.meituan.net/movie/230e71d398e0c54730d58dc4bb6e4cca51662.jpg@160w_220h_1e_1c", "title": "乱世佳人", "actor": "费雯·丽,克拉克·盖博,奥利维娅·德哈维兰", "score": "9.1", "time": "1939-12-15(美国)"}
https://github.com/wardseptember/crawlProject/tree/master/%E7%88%AC%E5%8F%96%E7%8C%AB%E7%9C%BC%E7%94%B5%E5%BD%B1%E6%8E%92%E8%A1%8C