Python crawler: Douban comments and ratings

Much of this is adapted from hang's blog post: https://segmentfault.com/a/1190000010473819

Ratings: the script below fetches the short-comment pages for the first "now playing" movie and records each comment's star rating.

# -*- coding: utf-8 -*-
"""
Created on Wed Sep 20 16:19:02 2017


@author: su
"""


from urllib import request
import re
from bs4 import BeautifulSoup as bs
"""resp = request.urlopen("https://movie.douban.com/nowplaying/hangzhou/" )
html_data = resp.read().decode("UTF-8")
soup = bs(html_data,"html.parser")
nowplaying_movie = soup.find_all("div",id="nowplaying")
nowplaying_movie_list = nowplaying_movie[0].find_all("li",class_="list-item")
nowplaying_list = []
for item in nowplaying_movie_list:
    nowplaying_dict={}
    nowplaying_dict["id"]= item["data-subject"]
    nowplaying_dict["title"]= item["data-title"]
    for tag_img_item in item.find_all("img"):
        nowplaying_dict["name"]=tag_img_item["alt"]
        nowplaying_list.append(nowplaying_dict)
"""
for start in range(0,60,20):
    requr = "https://movie.douban.com/subject/"+nowplaying_list[0]["id"]+"/comments?"+"start="+str(start)+"&limit=20"
    resp = request.urlopen(requr)
    html_data = resp.read().decode("UTF-8")
    soup = bs(html_data,"html.parser")
    comment_list = []   # ratings collected for this page (printed page by page below)
    comment_div_list = soup.find_all("div",class_="comment")
    for item in comment_div_list:
        comment_dict={}
        # the rating <span> sits in the comment header; a comment without a
        # star rating has fewer spans, so this index can fail
        item_score = item.find_all("h3")[0]
        item_score = item_score.find_all("span")[4]
        comment_dict["得分"] = item_score["class"]    # e.g. ['allstar50', 'rating']
        comment_dict["评级"] = item_score["title"]    # e.g. '力荐'
        comment_list.append(comment_dict)
        
    print(comment_list)
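
The printed "得分" is only a CSS class list such as ['allstar50', 'rating']. If a numeric value is more convenient, the star level appears to be encoded in the "allstarNN" class name and can be pulled out of it. A minimal post-processing sketch, assuming comment_list holds the dicts collected above (star_value is a helper introduced here, not part of the original script):

import re

def star_value(score_classes):
    # score_classes is the class list of the rating span, e.g. ['allstar40', 'rating']
    for c in score_classes:
        m = re.match(r"allstar(\d+)", c)
        if m:
            return int(m.group(1)) / 10   # allstar40 -> 4.0 stars
    return None

stars = [s for s in (star_value(d["得分"]) for d in comment_list) if s is not None]
if stars:
    print("average stars:", sum(stars) / len(stars))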

Comments: the script below fetches the short-comment text for one movie, keeps only the Chinese characters, and writes the result to s.txt.

# -*- coding: utf-8 -*-
"""
@author: su
"""
from urllib import request
import re
from bs4 import BeautifulSoup as bs   # needed for the parsing below
import jieba                           # Chinese word segmentation package
import pandas as pd
import numpy

"""
resp = request.urlopen('https://movie.douban.com/nowplaying/hangzhou/')
html_data = resp.read().decode('utf-8')
from bs4 import BeautifulSoup as bs
soup = bs(html_data, 'html.parser')    
nowplaying_movie = soup.find_all('div', id='nowplaying')
nowplaying_movie_list = nowplaying_movie[0].find_all('li', class_='list-item') 
nowplaying_list = [] 
for item in nowplaying_movie_list:        
        nowplaying_dict = {}        
        nowplaying_dict['id'] = item['data-subject']       
        for tag_img_item in item.find_all('img'):            
            nowplaying_dict['name'] = tag_img_item['alt']            
            nowplaying_list.append(nowplaying_dict)  

"""
requrl = 'https://movie.douban.com/subject/' + nowplaying_list[3]['id'] + '/comments' +'?' +'start=0' + '&limit=20' 
resp = request.urlopen(requrl) 
html_data = resp.read().decode('utf-8') 
soup = bs(html_data, 'html.parser') 
comment_div_list = soup.find_all('div', class_='comment')
eachCommentList = []
for item in comment_div_list:
    # keep only comments whose first <p> holds plain text
    if item.find_all('p')[0].string is not None:
        eachCommentList.append(item.find_all('p')[0].string)
# Join all comments, keep only the Chinese characters, and write the result once.
comments = ''
for k in range(len(eachCommentList)):
    comments = comments + str(eachCommentList[k]).strip()

pattern = re.compile(r'[\u4e00-\u9fa5]+')
filterdata = re.findall(pattern, comments)
cleaned_comments = ''.join(filterdata)

with open("s.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_comments)
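
The jieba and pandas imports at the top suggest the next step is the word-segmentation and word-frequency part of hang's post, which is not shown here. A minimal sketch of that step under that assumption, continuing from cleaned_comments above (the single-character filter and the top-20 cutoff are arbitrary choices for illustration):

import jieba
from collections import Counter

segments = jieba.lcut(cleaned_comments)            # cut the cleaned text into words
segments = [w for w in segments if len(w) > 1]     # drop single-character tokens
word_freq = Counter(segments)
print(word_freq.most_common(20))                   # 20 most frequent words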
              
