How to fix garbled text (mojibake) when crawling Weibo data for a given time period and saving it to a CSV file
(1. Crawling the Weibo data and storing it in a CSV)
The source code is as follows:
# start_chrome -> input_date -> scroll_down -> find_cards_info -> save -> find_next (goto)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import csv
import os

# Before running, download ChromeDriver from https://sites.google.com/a/chromium.org/chromedriver/downloads
# and click [Latest Release: ChromeDriver x.xx] to download it.
def start_chrome():
    driver = webdriver.Chrome(executable_path='./chromedriver')  # on Windows write './chromedriver.exe'
    driver.start_client()
    return driver

# weibo.com/sxbg? + start_t ...
def q(st, et):
    return f'?is_ori=1&key_word=&start_time={st}&end_time={et}&is_search=1&is_searchadv=1#_0'

def scroll_down():
    html_page = driver.find_element_by_tag_name('html')
    # ...# form > input
    for i in range(15):
        print(i)
        html_page.send_keys(Keys.END)
        time.sleep(0.6)

def find_cards_info():
    cards_sel = 'div.WB_feed_detail'
    cards = driver.find_elements_by_css_selector(cards_sel)
    info_list = []
    for card in cards:
        content_sel = 'div.WB_text.W_f14'
        time_sel = 'div.WB_from.S_txt2'
        link_sel = 'div.WB_from.S_txt2 > a:nth-child(1)'
        content = card.find_element_by_css_selector(content_sel).text
        post_time = card.find_element_by_css_selector(time_sel).text  # posting time of the card
        link = card.find_element_by_css_selector(link_sel).get_attribute('href')
        info_list.append([content, post_time, link])
    # [[1,2,3],[4,5,6]...]
    return info_list

def find_next():
    next_sel = 'a.page.next'
    next_page = driver.find_elements_by_css_selector(next_sel)
    if next_page:
        return next_page[0].get_attribute('href')

def save(info_list, name):
    full_path = './' + name + '.csv'  # 2018-01-02~2018-03-05.csv
    # No explicit encoding is given here, so the file is written with the platform
    # default; this is what later shows up garbled on Windows (see part 2 below).
    if os.path.exists(full_path):
        with open(full_path, 'a') as f:
            writer = csv.writer(f)
            writer.writerows(info_list)
            print('Done')
    else:
        with open(full_path, 'w+') as f:
            writer = csv.writer(f)
            writer.writerows(info_list)
            print('Done')

def run_crawler(base, duration):
    # 2018-01-02~2018-03-05
    if not base.endswith('feedtop'):
        st, et = duration.split('~')
        driver.get(base + q(st, et))
    else:
        driver.get(base)
    time.sleep(5)
    scroll_down()
    time.sleep(5)
    info_list = find_cards_info()
    save(info_list, duration)
    next_page = find_next()
    if next_page:
        run_crawler(next_page, duration)

base = 'https://weibo.com/bgsxy'  # replace with the Weibo user you want to crawl
driver = start_chrome()
input()  # pause here (e.g. to log in manually in the opened browser), then press Enter to continue
run_crawler(base, '2017-06-20~2018-02-03')  # replace with the time period you want to crawl
After running this, the resulting CSV file contains garbled characters.
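The cause can be reproduced in a few lines. The sketch below (the sample string is just a hypothetical illustration) encodes text as UTF-8, the way the crawler writes it, and then decodes the bytes with GBK, the way a default Windows CSV reader interprets them:

# Minimal illustration of the mojibake (hypothetical sample string)
text = '微博数据'
utf8_bytes = text.encode('utf-8')                      # what gets written to disk
garbled = utf8_bytes.decode('gbk', errors='replace')   # what a GBK reader sees
print(garbled)                                         # gibberish instead of the original text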
(2. How to fix the incompatibility on Windows? The crawled data shows up garbled in the CSV; this can be solved with the "Unicode sandwich" approach: decode the bytes to text when reading, work with text, and encode again when writing.)
The code is as follows:
import csv
import os

# The website is UTF-8 encoded, so the crawled file was written as UTF-8. On Windows,
# however, CSV files are opened with the GBK codec by default, which produces the
# garbled characters. (macOS is unaffected because its default encoding is UTF-8.)
path = 'D:\\2017-06-20~2018-09-03.csv'  # path of the crawled data
full_path = 'D:\\hsf.csv'               # path where the converted data is saved
info_list = []

# Decode with UTF-8, since most websites are UTF-8 encoded
with open(path, 'r', encoding='utf-8', newline='') as c:
    reader = csv.reader(c)
    for line in reader:
        if line:
            info_list.append(line)
            print(line)
print('D')

# Re-encode with gb18030, which displays correctly on Windows
if os.path.exists(full_path):
    with open(full_path, 'a', encoding='gb18030', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(info_list)
        print('Done')
else:
    with open(full_path, 'w+', encoding='gb18030', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(info_list)
        print('Done')
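If you would rather avoid the second conversion script altogether, one alternative is to have the crawler's save() write the file as 'utf-8-sig': the byte order mark it prepends lets Excel on Windows detect UTF-8 automatically. This is only a sketch of that variant, not part of the original course code; it reuses the save(info_list, name) signature from part 1.

import csv
import os

def save(info_list, name):
    full_path = './' + name + '.csv'
    if os.path.exists(full_path):
        mode, enc = 'a', 'utf-8'      # the BOM is already at the start of the file, so append plain UTF-8
    else:
        mode, enc = 'w', 'utf-8-sig'  # the BOM makes Excel on Windows recognize UTF-8
    # newline='' keeps the csv module from writing blank lines on Windows
    with open(full_path, mode, encoding=enc, newline='') as f:
        csv.writer(f).writerows(info_list)
    print('Done')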
Remember to set the file encoding in your IDE accordingly as well, otherwise you will still see errors.
Note: the source code comes from the practical course of 麻瓜教程 (Muggle Tutorial). Thanks also to my classmate 蔡坤林, who cleared up my questions, and to 夏老师 for the guidance! If anything here is inaccurate, corrections are welcome!