1.requests获取网页内容(中文api:http://docs.python-requests.org/zh_CN/latest/user/quickstart.html#id2)
2.bs4解析网页(中文api:http://beautifulsoup.readthedocs.io/zh_CN/latest/)
3.pymysql连接数据库存储信息
1.bs4可以快速筛选文字
userName = soup.select('span.comment-info > a') 可以筛选出网页中的 a 标签。
此时 userName 为列表(class 'list'),其中的元素为 class bs4.element.Tag。
我们可以通过 userName[i].string 快速取出 a 标签中的文字。
import requests
from bs4 import BeautifulSoup
import re
import pymysql
class parser(object):
    """Scrape user names and comment text from a Douban movie comment
    listing and store the collected rows in a MySQL table.

    Results accumulate in ``self.comList`` as dicts with the keys
    ``'userName'`` and ``'comments'``.
    """

    def __init__(self):
        # One dict per scraped comment: {'userName': ..., 'comments': ...}.
        self.comList = []

    def start(self, maxPages=10):
        """Crawl up to ``maxPages`` listing pages, print the results and
        write them to the database.

        Crawling stops early when a page has no "next" link.

        Args:
            maxPages: Upper bound on the number of pages to download.
        """
        url = 'https://movie.douban.com/subject/1291571/comments?sort=new_score&status=P'
        for page in range(1, maxPages + 1):
            print('正在爬取第 ' + str(page) + ' 页')
            html = self.htmlPrase(url)
            self.addList(html)
            # Do not resolve (or later download) a page that will never be
            # parsed.
            if page == maxPages:
                break
            url = self.getNewUrl(html)
            if url is None:
                # No further page is linked: nothing left to crawl.
                break
        for com in self.comList:
            print(com['userName'])
            print(com['comments'])
        self.addToDatabase()

    def htmlPrase(self, url, timeout=10):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds handed to ``requests.get`` so that a stalled
                connection cannot block the crawl forever.

        Returns:
            The decoded response body (``response.text``).
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
        response = requests.get(url, headers=headers, timeout=timeout)
        return response.text

    def addList(self, html):
        """Extract (user name, comment) pairs from ``html`` and append them
        to ``self.comList``.

        Args:
            html: Markup of one comment listing page.
        """
        soup = BeautifulSoup(html, 'html.parser')
        # NOTE(review): this pattern matches every line of the markup (plus
        # an empty string at each line end), so the paired value is the i-th
        # such match rather than a comment body. The original pattern
        # presumably contained the surrounding HTML tags - confirm against
        # the page markup and restore them.
        reg = re.compile(r'(.*)')
        comments = re.findall(reg, html)
        userName = soup.select('span.comment-info > a')
        # zip() stops at the shorter sequence, so a page with fewer matches
        # than user links cannot raise IndexError.
        for tag, text in zip(userName, comments):
            self.comList.append({'userName': tag.string, 'comments': text})

    def getNewUrl(self, html):
        """Return the absolute URL of the next listing page.

        Args:
            html: Markup of the current listing page.

        Returns:
            The next page's URL, or ``None`` when the page contains no
            ``<a class="next">`` element with an ``href``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        nextLink = soup.find('a', class_='next')
        if nextLink is None:
            return None
        urlParent = nextLink.get('href')
        if not urlParent:
            return None
        # The href is appended verbatim to the listing URL.
        return 'https://movie.douban.com/subject/1291571/comments' + urlParent

    def addToDatabase(self):
        """Insert every entry of ``self.comList`` into the ``douban`` table.

        Values are sent as query parameters, so quotes inside scraped text
        can neither break nor alter the statement. All rows are committed
        together, and the connection is closed even if an insert fails.
        """
        db = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='root',
            db='pythontext',
            charset='utf8'
        )
        try:
            # str() turns bs4 string objects into plain text before they
            # are handed to the driver.
            rows = [(str(com['userName']), str(com['comments']))
                    for com in self.comList]
            with db.cursor() as cursor:
                cursor.executemany(
                    "insert into douban(username, comments) VALUE (%s, %s)",
                    rows)
            db.commit()
        finally:
            db.close()
if __name__ == '__main__':
    # Script entry point: run one full crawl. The instance gets its own name
    # so the ``parser`` class is not rebound to one of its own instances.
    crawler = parser()
    crawler.start()