【python 爬虫】豆瓣评论全爬取含展开

目录

  • 用到的模块
  • 源代码
  • 数据清洗

学会抓包,搞到一切。

用到的模块

urllib.request(发送请求)、time(控制抓取间隔)、re(正则提取)、os(创建目录),均为 Python 标准库。

源代码

from urllib import request
import time
import re
import os
# Directory that receives one raw response file per review.
SAVE_DIR = r'C:\Users\*\Desktop\PYhomework\c800'
# Exclusive upper bound of the list-page offsets that are requested.
search_counts = 800
# Offset step between two consecutive list-page requests.
PAGE_STEP = 20
# Pause between two full-review requests, in seconds ("set it as you like").
REQUEST_DELAY = 0
url = 'https://movie.douban.com/subject/2353023/reviews'
# The original post redacted its request headers ("{***}"); fill in the
# values (User-Agent etc.) captured from your own browser session.
headers = {}
headers['Referer'] = 'https://movie.douban.com/subject/***/'

# NOTE(review): the original pattern was stripped when the post was converted
# to text (only a line break survived). This reconstruction assumes every
# review on the list page is marked up with data-cid="<review id>" -- verify
# against the live page before relying on it.
REVIEW_ID_PATTERN = re.compile(r'data-cid="(\d+)"')


def build_page_url(base_url, start):
    """Return the review-list URL for the page that begins at offset *start*.

    The URL is always derived from the untouched *base_url*; the original
    script appended to the same variable on every iteration, which produced
    ``...?start=0?start=20...`` from the second page onwards.
    """
    return base_url + "?start=" + str(start)


def extract_review_ids(html):
    """Return every review id found in *html*, in document order."""
    return REVIEW_ID_PATTERN.findall(html)


def fetch_text(target_url, request_headers):
    """Request *target_url* with *request_headers* and return the UTF-8 body."""
    req = request.Request(target_url, headers=request_headers)
    # The context manager closes the connection even if read/decode fails.
    with request.urlopen(req) as response:
        return response.read().decode("utf-8")


def crawl_reviews():
    """Collect review ids from the list pages, then save each full review.

    Every full-review response is written unmodified to
    ``SAVE_DIR\\comment<N>.txt``, numbered from 1.
    """
    # exist_ok lets the script be re-run without crashing on the directory.
    os.makedirs(SAVE_DIR, exist_ok=True)

    review_ids = []
    for start in range(0, search_counts, PAGE_STEP):
        page_html = fetch_text(build_page_url(url, start), headers)
        # Prepend, exactly as the original did, so file numbering is unchanged.
        review_ids = extract_review_ids(page_html) + review_ids

    # Fetch the actual reviews: these extra headers are only added for the
    # full-review requests.
    headers['Cookie'] = '***'
    headers['Host'] = 'movie.douban.com'
    headers['Sec-Fetch-Dest'] = 'document'
    headers['Sec-Fetch-Mode'] = 'navigate'
    headers['Sec-Fetch-Site'] = 'none'
    headers['Sec-Fetch-User'] = '?1'
    headers['Upgrade-Insecure-Requests'] = '1'
    print('爬取成功!')

    for index, review_id in enumerate(review_ids, start=1):
        full_url = 'https://movie.douban.com/j/review/' + review_id + '/full'
        comment = fetch_text(full_url, headers)
        out_path = SAVE_DIR + (r"\comment%d.txt" % index)
        with open(out_path, mode="w", encoding="utf-8") as out:
            out.write(comment)
        print("comment%d保存成功!" % index)
        time.sleep(REQUEST_DELAY)
    print("抓取完成!")


if __name__ == "__main__":
    crawl_reviews()

爬取的结果
【python 爬虫】豆瓣评论全爬取含展开_第1张图片

数据清洗

import os
import re
# Directory holding the raw responses (comment1.txt, comment2.txt, ...).
SOURCE_DIR = r"C:\Users\*\Desktop\PYhomework\c800"
# Directory that receives the cleaned text files.
OUTPUT_DIR = r'C:\Users\*\Desktop\PYhomework\c800ok'
# Highest comment number that is processed (files are numbered from 1).
COMMENT_COUNT = 800
# Text that immediately precedes the review body in a raw response.
HTML_FIELD_MARKER = ",\"html\":\""
# Text that the script treats as the end of the review body.
FIELD_TERMINATOR = '\"}'

# NOTE(review): the original pattern was stripped when the post was converted
# to text (only a line break survived). This reconstruction assumes it matched
# an HTML line-break tag such as <br> -- verify against real responses.
LINE_BREAK_PATTERN = re.compile(r'<br\s*/?>')


def extract_review_html(raw):
    """Return the text between the ``"html"`` marker and the next ``"}``.

    Returns ``None`` when the marker is absent; the original sliced from a
    bogus offset in that case because ``find`` returned -1. The terminator is
    searched *after* the marker, so an earlier ``"}`` in the response can no
    longer produce an empty result. If no terminator follows, the rest of
    *raw* is returned.
    """
    start = raw.find(HTML_FIELD_MARKER)
    if start == -1:
        return None
    start += len(HTML_FIELD_MARKER)
    end = raw.find(FIELD_TERMINATOR, start)
    if end == -1:
        end = len(raw)
    return raw[start:end]


def clean_review_text(text):
    """Turn line-break tags and literal ``\\t`` sequences into newlines and
    remove every space character."""
    text = LINE_BREAK_PATTERN.sub('\n', text)
    # The raw response still carries escape sequences as two characters
    # (backslash + "t"), hence the doubled backslash.
    text = text.replace('\\t', '\n')
    return text.replace(' ', '')


def clean_comments():
    """Clean every raw comment file in SOURCE_DIR into OUTPUT_DIR."""
    # Create the directory that is actually written to; the original created
    # a relative "c800ok" and crashed when it already existed.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    for i in range(1, COMMENT_COUNT + 1):
        src_path = SOURCE_DIR + (r"\comment%d.txt" % i)
        try:
            with open(src_path, encoding="utf-8") as src:
                raw = src.read()
        except FileNotFoundError:
            # Fewer reviews than COMMENT_COUNT may have been downloaded.
            print('comment%d.txt 不存在, 已跳过' % i)
            continue

        body = extract_review_html(raw)
        if body is None:
            print('comment%d.txt 中未找到 html 字段, 已跳过' % i)
            continue

        dst_path = OUTPUT_DIR + (r'\clear_comment%d.txt' % i)
        with open(dst_path, mode="w", encoding="utf-8") as dst:
            dst.write(clean_review_text(body))
        print('写入成功!')


if __name__ == "__main__":
    clean_comments()

清洗结果
【python 爬虫】豆瓣评论全爬取含展开_第2张图片
源码:https://tominochick.github.io/

你可能感兴趣的:(Python自学笔记)