【学习笔记】Python爬虫-豆瓣电影所有短评

用到的包:

1.requests获取网页内容(中文api:http://docs.python-requests.org/zh_CN/latest/user/quickstart.html#id2)

2.bs4解析网页(中文api:http://beautifulsoup.readthedocs.io/zh_CN/latest/)

3.pymysql连接数据库存储信息

遇到的问题:

1.bs4可以快速筛选文字
userName = soup.select('span.comment-info > a') 可以筛选出网页中 class 为 comment-info 的 span 下的 a 标签。
此时 userName 是一个列表(<class 'list'>),其中每个元素的类型为 <class 'bs4.element.Tag'>。
我们可以通过 userName[i].string 快速取出第 i 个 a 标签中的文字。

完整代码

import requests
from bs4 import BeautifulSoup
import re
import pymysql


class parser(object):
    """Crawl the short comments of one Douban movie and store them in MySQL.

    Usage: ``parser().start()``. Collected entries accumulate in
    ``self.comList`` as dicts with the keys ``'userName'`` and ``'comments'``.
    """

    # Comment page of the movie being crawled; pagination hrefs are appended
    # to this URL to build the address of the following page.
    baseUrl = 'https://movie.douban.com/subject/1291571/comments'

    # Upper bound on the number of pages fetched by start().
    maxPages = 10

    # NOTE(review): the original pattern literal was damaged when this file
    # was extracted (the HTML inside the string was stripped). It presumably
    # matched the opening tag of the comment paragraph; confirm against the
    # live page markup before relying on it.
    commentPattern = re.compile(r'<p class="">(.*)')

    def __init__(self):
        # One dict per scraped comment: {'userName': ..., 'comments': ...}.
        self.comList = []

    def start(self):
        """Crawl up to ``maxPages`` pages, print the results and save them.

        Stops early when a page has no "next" link, and only downloads a page
        that is actually going to be parsed.
        """
        url = self.baseUrl + '?sort=new_score&status=P'
        for page in range(1, self.maxPages + 1):
            print('正在爬取第 ' + str(page) + ' 页')
            html = self.htmlPrase(url)
            self.addList(html)
            url = self.getNewUrl(html)
            if url is None:
                # No further page to follow.
                break
        for com in self.comList:
            print(com['userName'])
            print(com['comments'])
        self.addToDatabase()

    def htmlPrase(self, url):
        """Download ``url`` and return the response body as text.

        Raises:
            requests.RequestException: on connection errors or timeout.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
        # A timeout keeps the crawler from hanging forever on a stalled socket.
        response = requests.get(url, headers=headers, timeout=10)
        return response.text

    def addList(self, html):
        """Extract (user name, comment) pairs from ``html`` into ``comList``.

        User names come from the ``span.comment-info > a`` tags; comment texts
        come from ``commentPattern``. The two sequences are paired by position.
        """
        soup = BeautifulSoup(html, 'html.parser')
        comments = self.commentPattern.findall(html)
        userTags = soup.select('span.comment-info > a')
        # zip() stops at the shorter sequence, so a page where the two
        # extractions disagree in length no longer raises IndexError.
        for tag, text in zip(userTags, comments):
            self.comList.append({'userName': tag.string, 'comments': text})

    def getNewUrl(self, html):
        """Return the URL of the next comment page, or ``None`` if absent."""
        soup = BeautifulSoup(html, 'html.parser')
        nextLink = soup.find('a', class_='next')
        if nextLink is None or not nextLink.get('href'):
            return None
        return self.baseUrl + nextLink['href']

    def addToDatabase(self):
        """Insert every entry of ``comList`` into the ``douban`` table.

        Values are passed as query parameters so the driver escapes them;
        quotes inside a comment can no longer break or alter the statement.
        The connection is closed even when an insert fails.
        """
        rows = [(str(com['userName']), str(com['comments']))
                for com in self.comList]
        db = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='root',
            db='pythontext',
            charset='utf8'
        )
        try:
            with db.cursor() as cursor:
                cursor.executemany(
                    "insert into douban(username, comments) VALUES (%s, %s)",
                    rows)
            db.commit()
        finally:
            db.close()


if __name__ == '__main__':
    # Use a distinct name so the class is not rebound to its own instance.
    crawler = parser()
    crawler.start()

你可能感兴趣的:(python)