使用 requests 以及 bs4 库爬取豆瓣电影 Top 250 榜单的前 100 部

import requests
import re
from bs4 import BeautifulSoup

def getHtmlText(url, ulist):
    """Fetch one Douban Top 250 page and append its movies to *ulist*.

    Every movie found on the page is appended as a 4-item list
    ``[title, link, rating, one_line_review]``. Fields that are absent from
    the page (some movies have no one-line review) are stored as ``''`` so
    that a missing field can never shift data onto a neighbouring movie.

    Args:
        url: Address of the list page to download.
        ulist: List that is extended in place with one record per movie.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # NOTE(review): Douban is known to answer the default python-requests
    # User-Agent with an error status, so present a browser-like one.
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/120.0 Safari/537.36'),
    }
    # A timeout keeps the crawl from hanging forever on a stalled connection.
    r = requests.get(url, headers=headers, timeout=10)
    # Fail loudly instead of silently parsing an error page into zero records.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    for header in soup.find_all('div', class_='hd'):
        # Look up rating and review inside this movie's own container rather
        # than zipping page-wide lists by position.
        # Assumes each movie is wrapped in <div class="item">; falls back to
        # the header's direct parent otherwise -- TODO confirm against page.
        item = header.find_parent('div', class_='item') or header.parent

        # Alternate titles are rendered with a '/' separator; keep only the
        # first title that has none.
        title = ''
        for span in header.find_all('span', class_='title'):
            text = span.get_text()
            if '/' not in text:
                title = text
                break

        anchor = header.a
        link = anchor.get('href', '') if anchor is not None else ''

        rate_tag = item.find('span', class_='rating_num')
        rate = rate_tag.get_text() if rate_tag is not None else ''

        intro_tag = item.find('span', class_='inq')
        intro = intro_tag.get_text() if intro_tag is not None else ''

        ulist.append([title, link, rate, intro])

def printText(ulist):
    """Print every collected movie record as labelled ``label:value`` lines.

    Each record is read by index as ``[title, link, rating, review]`` and
    printed in the order title, rating, link, review.
    """
    # (label, index into the record) in output order.
    layout = (
        ('名称', 0),
        ('评分', 2),
        ('链接', 1),
        ('短评', 3),
    )
    for record in ulist:
        for label, position in layout:
            print1(label, record[position])
# TODO: persist the scraped data automatically and lay it out as a proper table.

def print1(name, info):
    """Print one ``name:info`` line to standard output."""
    line = f'{name}:{info}'
    print(line)

def main():
    """Crawl the first four pages of the Douban Top 250 list and print them.

    The pages are requested with ``start`` offsets 0, 25, 50 and 75; all
    records are accumulated in one list and printed once at the end.
    """
    ulist = []
    for page in range(4):
        url = 'https://movie.douban.com/top250?start={}'.format(page * 25)
        getHtmlText(url, ulist)
    # Print only after the loop: ulist accumulates across pages, so printing
    # inside the loop would repeat every earlier page on each iteration.
    printText(ulist)


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

在看完网课后首次编写爬虫,哎,感觉好笨,参考了很多博客。因为经验不足出现了很多简单的错误,比如 tuple index out of range、'list' object is not callable 等等。下次的目标是爬取电影短评,并将数据合理地保存。

你可能感兴趣的:(爬虫)