https://www.jianshu.com/c/b4d0bf551689
The goal is to scrape the articles in this collection.
On first loading the page you will notice it is an infinitely scrolling list, so every entry has to be loaded dynamically by scrolling all the way to the bottom:
from selenium import webdriver
import time

url = 'https://www.jianshu.com/c/b4d0bf551689'
browser = webdriver.Chrome()
browser.get(url)
# Scroll down in small steps; once the bottom is reached, tag the page title with "scroll-done"
browser.execute_script("""
    (function () {
        var y = 0;
        var step = 100;
        window.scroll(0, 0);
        function f() {
            if (y < document.body.scrollHeight) {
                y += step;
                window.scroll(0, y);
                setTimeout(f, 100);
            } else {
                window.scroll(0, 0);
                document.title += "scroll-done";
            }
        }
        setTimeout(f, 1000);
    })();
""")
while True:
    if "scroll-done" in browser.title:
        break
    else:
        print('Still scrolling...')
        time.sleep(1)  # avoid busy-waiting while the page keeps loading
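Instead of hand-rolling the polling loop, Selenium's WebDriverWait can watch the title for us. A minimal sketch (the 120-second timeout is an assumption; tune it to the length of the list):

from selenium.webdriver.support.ui import WebDriverWait

# Block until the injected script appends "scroll-done" to the title,
# or raise TimeoutException after 120 seconds (assumed timeout).
WebDriverWait(browser, 120).until(lambda d: "scroll-done" in d.title)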
Next, extract the data we are after.
To work out where the article title and its link live, press F12 and inspect the element in the developer tools.
Every other field you want to scrape is found the same way, as sketched below.
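Each article sits in an li under ul.note-list, and its title link is the first a inside div.content. A minimal sketch of pulling out just the title and the detail-page URL (selector names taken from the full script below; the class names on jianshu.com may change over time):

from bs4 import BeautifulSoup

page_soup = BeautifulSoup(browser.page_source, 'html.parser')
note_list = page_soup.find('ul', attrs={'class': 'note-list'})
for li in note_list.findAll('li'):
    a = li.find('div', attrs={'class': 'content'}).find('a')
    content_name = a.text                                   # article title
    content_href = f"https://www.jianshu.com{a['href']}"    # detail-page URL
    print(content_name, content_href)

The full script builds on exactly this pattern, then opens a second browser to visit each detail page: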
# -*- encoding:utf-8 -*-
import requests
from selenium import webdriver
from bs4 import BeautifulSoup
import re
import time
import pymysql


def getHtml(url):
    # Plain requests-based fetch (kept for reference; get_Details below uses Selenium instead)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36 Edg/84.0.522.44'
    }
    response = requests.get(url, headers=headers)
    return response.text
def get_Details(url):
    # Open the browser
    browser = webdriver.Chrome()
    # Load the collection page
    browser.get(url)
    # Scroll the window all the way down so every article gets loaded
    browser.execute_script("""
        (function () {
            var y = 0;
            var step = 100;
            window.scroll(0, 0);
            function f() {
                if (y < document.body.scrollHeight) {
                    y += step;
                    window.scroll(0, y);
                    setTimeout(f, 100);
                } else {
                    window.scroll(0, 0);
                    document.title += "scroll-done";
                }
            }
            setTimeout(f, 1000);
        })();
    """)
    print("Scrolling down.....")
    # time.sleep(180)
    # Flag marking whether we have already reached the bottom
    count = False
    while True:
        # False on the first pass, so the loop keeps going
        if count == True:
            break
        # Once the bottom has been reached, start extracting the data
        if "scroll-done" in browser.title:
            # Source of the fully loaded page
            page_html = browser.page_source
            # Parse the source
            page_soup = BeautifulSoup(page_html, 'html.parser')
            # Parent tag that holds every article entry
            note_list = page_soup.find('ul', attrs={'class': 'note-list'})
            # If it exists
            if note_list != None:
                # Collect all the articles
                li_All = note_list.findAll('li')
                # Open a second browser for visiting each article's detail page
                page_chrome = webdriver.Chrome()
                # Walk over every article
                for li in li_All:
                    sql = {'id': 'NULL',
                           'content_name': '',
                           'content_href': '',
                           'nice': '',
                           'chapter': '',
                           'article': '',
                           'abstract': ''
                           }
                    if li.find('div', attrs={'class': 'content'}) != None:
                        if li.find('div', attrs={'class': 'content'}).find('a') != None:
                            # Article title
                            content_name = li.find('div', attrs={'class': 'content'}).find('a').text
                            sql['content_name'] = content_name
                            # Article URL, used to reach the detail page
                            content_href = f"https://www.jianshu.com{li.find('div', attrs={'class': 'content'}).find('a')['href']}"
                            sql['content_href'] = content_href
                            print("Article URL:", content_href)
                            # Load the detail page we just found
                            page_chrome.get(content_href)
                            time.sleep(3)
                            # Source of the detail page
                            details_html = page_chrome.page_source
                            # Parse it
                            details_soup = BeautifulSoup(details_html, 'html.parser')
                            # Number of likes
                            pnjry = details_soup.find('div', attrs={'class': '_3Pnjry'})
                            if pnjry != None:
                                puukr = pnjry.find('div', attrs={'class': '_1pUUKr'})
                                if puukr != None:
                                    png = puukr.find('div', attrs={'class': 'P63n6G'})
                                    if png != None:
                                        pnwj = png.find("span")
                                        sql['nice'] = pnwj.text
                            # Collect every chapter listed in the article body
                            rhmja = details_soup.find('article', attrs={'class': '_2rhmJa'})
                            p_list = rhmja.findAll('p')
                            if not p_list:
                                continue
                            p_name = []
                            p_articles_Numpy = []
                            for p in p_list:
                                if str(p).__contains__('丛铭'):
                                    break
                                p_article_S_ALL = ""
                                time.sleep(2)
                                a = p.find('a')
                                if a == None:
                                    continue
                                a_name = a.text
                                p_name.append(a_name)
                                # Open this chapter's content page
                                a_href = a['href']
                                if not str(a_href).__contains__('https://www.jianshu.com/p'):
                                    continue
                                page_chrome.get(a_href)
                                article_html = page_chrome.page_source
                                # Skip the chapter if the link is already dead
                                if not str(article_html).__contains__('抱歉,你访问的页面不存在。'):
                                    article_soup = BeautifulSoup(article_html, 'html.parser')
                                    article = article_soup.find('article', attrs={'class': '_2rhmJa'})
                                    p_article_All = article.findAll('p')
                                    if not p_article_All:
                                        continue
                                    for p_article in p_article_All:
                                        p_article_S_ALL += p_article.text
                                    p_articles_Numpy.append(p_article_S_ALL)
                            # print(p_articles_Numpy)
                            # print(p_name)
                            sql['chapter'] = p_name
                            sql['article'] = p_articles_Numpy
                        if li.find('div', attrs={'class': 'content'}).find('p', attrs={'class': 'abstract'}) != None:
                            abstract = str(li.find('div', attrs={'class': 'content'}).find('p', attrs={'class': 'abstract'}).text)
                            abstract = abstract.strip()
                            sql['abstract'] = abstract
                            # print(abstract)
                        print(sql)
                        conneMysql(sql)
            # Once everything has been collected, flip the flag so we do not loop again
            count = True
        else:
            print("Still scrolling...")
            time.sleep(1)  # avoid busy-waiting
def conneMysql(sql):
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='root',
        db='test',
        charset='utf8',
        autocommit=True,  # commit inserts automatically; same effect as calling conn.commit()
    )
    cur = conn.cursor()
    # Build the INSERT statement column by column, in the dict's insertion order
    insert = 'insert into `jianshu_table` values ('
    for item in sql:
        if item == 'chapter':
            chapter = str(sql[item]).replace("[", "").replace("]", "").replace(",", "|").replace("'", "")
            insert += f"'{chapter}',"
        elif item == 'article':
            article = str(sql[item]).replace("[", "").replace("]", "").replace(",", "|").replace("'", "")
            insert += f"'{article}',"
        elif item == 'id':
            insert += f"{sql[item]},"
        elif item != 'abstract':
            insert += f"'{sql[item]}',"
        else:
            # 'abstract' is the last key, so it closes the VALUES list
            insert += f"'{str(sql[item]).strip()}');"
    try:
        insert_sqli = insert
        cur.execute(insert_sqli)
    except Exception as e:
        print("Insert failed:", e)
    else:
        # Commit after inserting, otherwise the rows never show up in the database
        conn.commit()
        print("Insert succeeded;")
if __name__ == '__main__':
    url = 'https://www.jianshu.com/c/b4d0bf551689'
    get_Details(url)

The target table is created with the following SQL:
DROP TABLE IF EXISTS `jianshu_table`;
CREATE TABLE `jianshu_table` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`content_name` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`content_href` varchar(500) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`nice` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`chapter` varchar(2000) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
`article` longtext CHARACTER SET utf8 COLLATE utf8_general_ci NULL,
`abstract` longtext CHARACTER SET utf8 COLLATE utf8_general_ci NULL,
PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 2 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Compact;
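The string-built INSERT in conneMysql breaks as soon as a title, chapter name, or abstract contains a single quote. A safer variant is to let pymysql do the escaping with a parameterized query; below is a minimal sketch under the same table layout (conneMysql_param is a hypothetical name, not part of the original code):

def conneMysql_param(sql):
    # Same connection settings as conneMysql above
    conn = pymysql.connect(host='localhost', user='root', password='root',
                           db='test', charset='utf8')
    try:
        with conn.cursor() as cur:
            cur.execute(
                "insert into `jianshu_table` "
                "(content_name, content_href, nice, chapter, article, abstract) "
                "values (%s, %s, %s, %s, %s, %s)",
                (sql['content_name'], sql['content_href'], sql['nice'],
                 '|'.join(sql['chapter']), '|'.join(sql['article']),
                 str(sql['abstract']).strip()))
        conn.commit()
        print("Insert succeeded;")
    except Exception as e:
        print("Insert failed:", e)
    finally:
        conn.close()

Because the column list is spelled out, the auto-increment id no longer has to be passed in, and the list fields are joined with '|' just as the original replace() calls intended.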