目标网站:m.weibo.cn
接口 URL 可以在浏览器开发者工具(F12)的 Network 面板中,通过筛选 XHR 请求找到。
weibo_demo.py:
import requests
import json
from w3lib.html import remove_tags
from mysqlhelper import MySQLHelper
import time
# Number of result pages the crawl loop requests.
max_page = 50

# HTTP headers sent with every request (a desktop Firefox User-Agent).
# NOTE(review): advertising 'br' presumably requires a Brotli decoder to be
# available to requests/urllib3 -- confirm before relying on it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
}

# Shared database helper used to persist each scraped post.
helper = MySQLHelper()
def get_one_page_info(url, timeout=10):
    """Fetch one page of search results and store every post in MySQL.

    For each card in ``data.cards`` that carries an ``mblog`` entry, the
    post's text (with HTML tags removed), comment/like/repost counts,
    creation time and source are inserted into the ``weibo_test`` table
    through the module-level ``helper``.

    Args:
        url: Full URL of the JSON API page to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON parse error.
    response.raise_for_status()

    # json.loads() turns the JSON text into a dict.
    res_dict = json.loads(response.text)

    # A page past the end of the results may carry no 'data'/'cards'
    # (assumption about the API -- TODO confirm); treat it as an empty page
    # instead of crashing with KeyError.
    cards_list = (res_dict.get('data') or {}).get('cards') or []

    insert_sql = 'INSERT INTO weibo_test (source_a, created_at, `text`, comments_count, attitudes_count, reposts_count)VALUES (%s, %s, %s, %s, %s, %s)'

    for card in cards_list:
        # Only cards that wrap a post have an 'mblog' entry.
        mblog = card.get('mblog')
        if not mblog:
            continue
        data = (
            mblog['source'],
            mblog['created_at'],
            remove_tags(mblog['text']),  # post text arrives as HTML
            mblog['comments_count'],
            mblog['attitudes_count'],
            mblog['reposts_count'],
        )
        helper.execute_insert_sql(insert_sql, data)
# 建表语句(表名必须与 INSERT 语句中使用的 weibo_test 一致;为了能存储 emoji 等 4 字节字符,字符集使用 utf8mb4):
# CREATE TABLE weibo_test(id int primary key auto_increment,source_a varchar(50),created_at varchar(40),`text` text,comments_count int,attitudes_count int,reposts_count int) default charset=utf8mb4;
# 清空表:truncate table 表名;
if __name__ == '__main__':
    # Search endpoint; containerid is the URL-encoded query
    # "100103type=60&q=新冠疫情&t=0". The page number must be its own query
    # parameter ("&page="); the earlier "%page=" glued it onto page_type,
    # so no page parameter was ever sent.
    url_template = 'https://m.weibo.cn/api/container/getIndex?containerid=100103type%3D60%26q%3D%E6%96%B0%E5%86%A0%E7%96%AB%E6%83%85%26t%3D0&page_type=searchall&page={}'
    for page in range(1, max_page + 1):
        get_one_page_info(url_template.format(page))
        # Report progress only after the page has actually been processed.
        print('page ' + str(page) + ' has done!')
mysqlhelper.py:
import pymysql
class MySQLHelper(object):
    """Small wrapper around one pymysql connection and cursor.

    The connection uses ``utf8mb4`` by default so that 4-byte UTF-8
    characters (e.g. emoji) can be sent to the server; the legacy ``utf8``
    charset rejects them with error 1366. The target table/column must
    also use utf8mb4 for such characters to be stored.
    """

    def __init__(self, host='localhost', port=3306, db='wb', user='root',
                 passwd='123456', charset='utf8mb4'):
        """Open the connection and create a cursor.

        Args:
            host: MySQL server host name.
            port: MySQL server port.
            db: Name of the database to use.
            user: Login user name.
            passwd: Login password.
            charset: Connection character set.
        """
        # Define both attributes up front so close()/__del__ stay safe even
        # if connect() raises below.
        self.conn = None
        self.cursor = None
        self.conn = pymysql.connect(host=host, port=port, database=db,
                                    user=user, password=passwd,
                                    charset=charset)
        # The cursor is the handle through which statements are executed.
        self.cursor = self.conn.cursor()

    def execute_insert_sql(self, sql, data):
        """Execute one parameterized statement and commit it.

        Args:
            sql: SQL text with ``%s`` placeholders.
            data: Sequence of values bound to the placeholders.

        Raises:
            Exception: Whatever the driver raised; the transaction is
                rolled back before the error is re-raised.
        """
        try:
            self.cursor.execute(sql, data)
            self.conn.commit()
        except Exception:
            # Do not leave a half-finished transaction on the connection.
            self.conn.rollback()
            raise

    def close(self):
        """Close the cursor and the connection; safe to call repeatedly."""
        cursor = getattr(self, 'cursor', None)
        conn = getattr(self, 'conn', None)
        # Drop the references first so a second call is a no-op.
        self.cursor = None
        self.conn = None
        try:
            if cursor is not None:
                cursor.close()
        finally:
            if conn is not None:
                conn.close()

    def __del__(self):
        # Best-effort cleanup when the helper is garbage collected.
        self.close()
if __name__ == '__main__':
    # Manual smoke test: insert one hard-coded row into weibo_test.
    data = ('mi', '2020-4-22', '今天天气好', 2, 3, 5)
    insert_sql = (
        'INSERT INTO weibo_test (source_a, created_at, `text`, '
        'comments_count, attitudes_count, reposts_count)'
        'VALUES (%s, %s, %s, %s, %s, %s)'
    )
    # Instantiate the helper (opens the database connection).
    helper = MySQLHelper()
    helper.execute_insert_sql(insert_sql, data)
2020.4.30报错:
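后来在运行中发现会报错:pymysql.err.InternalError: (1366, "Incorrect string value: '\\xF0\\x9F\\x98\\xB7 ' for column 'text' at row 1")。
这个错误发生在向 MySQL 写入 emoji 等 4 字节 UTF-8 字符时(例如这里的 \xF0\x9F\x98\xB7 就是一个 emoji):MySQL 的 utf8 字符集每个字符最多只支持 3 个字节,无法表示这类字符。解决办法是在 mysqlhelper.py 中创建游标后加上 self.cursor.execute("SET NAMES utf8mb4;"),或者直接在 pymysql.connect 中使用 charset='utf8mb4';同时数据表(或 text 列)的字符集也应改为 utf8mb4,否则这类字符仍然无法正确保存。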