模拟登录豆瓣,并爬取豆瓣电影简评

模拟登录豆瓣,并爬取豆瓣电影简评

本人非计算机专业,写代码纯属个人兴趣爱好。这段代码用了4天写完,其中还有很多可以优化的地方,有时间我会继续优化。
1、其中有一些库我不记得最后有没有用到,懒得逐个检查,就都导入了
2、其中有些代码可以优化,比如 `getShortCommentInformation1` 函数,博主又懒了,下次我会修改的
3、这样的代码比较容易被封IP,我已经被封了【捂脸】。可以试试动态IP代理池,由于这个我还没有学过,只能后期有时间边学边做,再把代码补充上去

import requests
import sys
import importlib
import re
from bs4 import BeautifulSoup
import traceback
import random
import time
import xlwt

# 定义所需函数
def getHTMLText(req,url,timeout=8):
    """Download ``url`` through ``req`` after a short random pause.

    Args:
        req: Object whose ``get(url, timeout=...)`` returns a requests-style
            response; ``main`` passes a ``requests.Session``.
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl forever.

    Returns:
        The decoded response body, or ``None`` when the request fails
        (connection error, timeout, or an HTTP error status).
    """
    # Wait a random 2-5 seconds before every request (presumably to lower
    # the chance of being rate-limited). Kept outside the ``try`` so that
    # Ctrl+C during the pause stops the program instead of being swallowed.
    time.sleep(random.uniform(2,5))
    try:
        r = req.get(url,timeout=timeout)
        r.raise_for_status()
        # Decode with the encoding detected from the body itself.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures are treated as "page unavailable"; anything
        # else (KeyboardInterrupt, programming errors) propagates.
        print('getHTMLText出错')
        return None

def getShortCommentInformation1(req,lst,commentURL):
    """Scrape one comment listing page and append one record per comment to ``lst``.

    For every ``div.comment-item`` on the page the vote count, star rating
    and comment text are read from the listing itself. The first link inside
    the item is then fetched (presumably the commenter's profile page) to
    read the join year, follower count and number of collected movies.

    Args:
        req: Session-like object forwarded to ``getHTMLText``.
        lst: List extended in place with
            ``[votes, star, comment, years, fans, movies]`` records.
        commentURL: URL of the comment listing page.

    Returns:
        None. Comments whose markup does not match the expected layout are
        skipped; nothing is appended when the listing cannot be downloaded.
    """
    html = getHTMLText(req,commentURL)
    if html is None:
        # The download failed; there is nothing to parse on this page.
        return

    soup = BeautifulSoup(html,'html.parser')
    for item in soup.find_all('div',class_='comment-item'):
        try:
            # --- Fields available on the listing page ---
            votes = item.find_all('span',class_='votes')[0].string
            profile_url = item.find_all('a')[0].attrs['href']
            info = item.h3.find_all('span',class_='comment-info')
            # The child at index 5 of the comment-info span is expected to
            # carry the rating in its first CSS class; the first digit of
            # that class name is taken as the star count.
            star_class = info[0].contents[5].attrs['class'][0]
            star = re.search(r'\d',star_class).group(0)
            comment = item.find_all('span',class_='short')[0].string

            # --- Fields from the linked (profile) page ---
            html1 = getHTMLText(req,profile_url)
            if html1 is None:
                continue
            soup1 = BeautifulSoup(html1,'html.parser')
            user_info = soup1.find_all('div',class_='user-info')
            # First 2000-2019 year found in the user-info text. The
            # character class is [01]; a comma inside it would also match
            # a literal ','.
            join_year = re.search(r'20[01]\d',user_info[0].text).group(0)
            # NOTE(review): the reference year 2019 is hard-coded.
            years = 2019 - int(join_year)

            # Rebuild the follower / collection link targets from the
            # profile URL and read the first number in each link's text.
            tail = re.search(r'douban.+',profile_url).group(0)
            fans_links = soup1.find_all('a',href='https://www.'+tail+'rev_contacts')
            fans = re.search(r'\d+',fans_links[0].text).group(0)
            movie_links = soup1.find_all('a',href='https://movie.'+tail+'collect')
            movies = re.search(r'\d+',movie_links[0].text).group(0)
        except (AttributeError,IndexError,KeyError,TypeError,ValueError):
            # Missing tag, missing attribute or failed regex match: the
            # item does not have the expected layout, so skip it.
            continue

        # One record per commenter, collected in the caller's list.
        lst.append([votes,star,comment,years,fans,movies])
        
def main():
    """Log in to Douban, scrape the requested number of comment pages and save them to Excel.

    The account name and password are read from the ``DOUBAN_NAME`` and
    ``DOUBAN_PASSWORD`` environment variables, falling back to interactive
    prompts, so that no credentials are stored in the source file. The
    number of pages to scrape is read from standard input.
    """
    import getpass
    import os

    # Simulated login
    req = requests.Session()

    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'}
    # Install the browser User-Agent on the session so the scraping requests
    # made later through ``req`` carry it too, not only the login POST.
    req.headers.update(headers)

    url = 'https://accounts.douban.com/j/mobile/login/basic'
    data = {
            'ck': '',
            'name': os.environ.get('DOUBAN_NAME') or input('豆瓣账号:'),
            'password': os.environ.get('DOUBAN_PASSWORD') or getpass.getpass('豆瓣密码:'),
            # Field name corrected from the misspelt 'remenber'.
            # NOTE(review): confirm against the login form.
            'remember':'false',
            'ticket':''
            }
    r = req.post(url, data=data, timeout=8)
    if not r.ok:
        # Keep going (best effort), but make the failed login visible.
        print('登录请求失败,状态码:', r.status_code)

    # Scrape the comment pages
    slist = []
    pages = int(input('需要爬取的页数:'))
    for page in range(pages):
        time.sleep(random.uniform(0,1))
        # Each listing page holds 20 comments; ``start`` is the offset of the first one.
        commentURL = 'https://movie.douban.com/subject/26266893/comments?start='+str(page*20)+'&limit=20&sort=new_score&status=P'
        getShortCommentInformation1(req,slist,commentURL)

    # Write to Excel
    we = xlwt.Workbook()
    sheet1 = we.add_sheet(u'豆瓣流浪地球短评',cell_overwrite_ok=True)
    rowTitle = [u'编号',u'短评被点赞数',u'星级',u'短评',u'评论人用豆瓣时间','粉丝数',u'评论人看过电影的数量']

    # Header row
    for col, title in enumerate(rowTitle):
        sheet1.write(0,col,title)

    # Data rows start at sheet row 1. Column 0 holds the 1-based record
    # number, followed by the scraped fields. The number is written directly
    # instead of being inserted into the scraped lists.
    for number, record in enumerate(slist, start=1):
        sheet1.write(number,0,number)
        for col, value in enumerate(record, start=1):
            sheet1.write(number,col,value)

    # xlwt produces the legacy .xls format, so the file is saved with that
    # extension rather than .xlsx.
    we.save('C:/Users/99364/Desktop/python/wandering_earth.xls')
    
# Run the scraper only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()

你可能感兴趣的:(模拟登录豆瓣,并爬取豆瓣电影简评)