The development environment is Python 3.6; see the index of the full crawler project series for all related content.
Understand the Python crawler framework: what you see is what you can get, and anything becomes possible.
This article walks through examples of scraping data from dynamic pages and Ajax-rendered pages, together with the corresponding page-analysis process. You will find that these seemingly complex crawlers are actually simpler than crawlers for non-dynamic pages.
Granted, a Python data analyst who cannot write code is not a good data analyst, but you are not a full-time developer, so there is little point in becoming a virtuoso coder. Learning the basics of data scraping is enough to turn the tedious copy-and-paste routine (Ctrl+C, Ctrl+V) into an automated job.
Nowadays more and more pages ship an original HTML document that contains no data at all; the data is loaded uniformly through Ajax. The process from sending an Ajax request to the page being updated is roughly: the page's JavaScript sends the request, the server returns the data (usually JSON), and the script renders that data into the page.
To see these requests, open the browser's developer tools, switch to the Network tab, and filter by XHR. Before running the scripts below, create the folders referenced in all_config_file.py, adjust that configuration to your own environment, and start the required services (e.g. MongoDB). A minimal sketch of replaying such an XHR request follows.
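As a quick illustration (a sketch only, not part of the project code; the URL and parameters are hypothetical placeholders), a request found under the XHR filter can usually be replayed directly with requests, and its JSON payload inspected without rendering the page at all:

import requests

# hypothetical Ajax endpoint copied from the Network tab (XHR filter)
url = "http://example.com/api/news/list"
params = {"pageNum": 1, "pageSize": 20}
headers = {"User-Agent": "Mozilla/5.0"}

resp = requests.get(url, params=params, headers=headers, timeout=10)
resp.raise_for_status()
data = resp.json()   # the data arrives as JSON rather than rendered HTML
print(data)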
all_config_file.py
#coding=utf-8
__author__ = 'Mr数据杨'
__explain__ = 'Configuration file shared by the crawler scripts for each target site'

# imports
import time
import pymongo
import pandas as pd

def news_page_num():
    page_num = input("Number of pages to crawl per site: ")
    return int(page_num)

def title_error_num():
    title_error_num = input("Maximum number of title errors to tolerate: ")
    return int(title_error_num)

def body_error_num():
    body_error_num = input("Maximum number of body errors to tolerate: ")
    return int(body_error_num)

def mongodb_client():
    # get the MongoClient object
    client = pymongo.MongoClient("localhost", 27017)
    # get the database object to use
    db = client.news
    print("MongoDB database loaded......")
    return db

db = mongodb_client()

def time_today():
    # global date helper
    time_today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    print("Global date helper loaded......")
    return time_today

# error log for failed title pages
def error_text_title(text, time_today):
    print("Title error log loaded......")
    with open("logs/" + time_today + " news_title_error.txt", "a") as f:
        f.write(text + '\n')

# error log for failed body pages
def error_text_body(text, time_today):
    with open("logs/" + time_today + " news_body_error.txt", "a") as f:
        f.write(text + '\n')

# fetch the link of every page to crawl from MongoDB
def get_title_links_from_MongoDB(label, type):
    result = []
    for item in db.news_tmp.find({'label': label, 'type': type},
                                 {'url': 1, '_id': 1}):
        result.append(item)
    result = pd.DataFrame(result, columns=['url', '_id'])
    return result
Main program
# imports
import urllib
import urllib.request
import requests
import datetime
from bs4 import BeautifulSoup

import all_config_file
from all_config_file import error_text_title
from all_config_file import error_text_body
from all_config_file import get_title_links_from_MongoDB

cqcoal = "http://news.cqcoal.com/manage/newsaction.do?method:webListPageNewsArchivesByTypeid"
print("Target URL loaded......")
db = all_config_file.mongodb_client()
time_today = all_config_file.time_today()
def cqcoal_title_start(num):
    def start_type(url, label, typeid, pagenum, type):
        try:
            page_num = 1
            while page_num <= pagenum:
                print("Start crawling: " + url)
                user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
                headers = {'User-Agent': user_agent}
                # the news list is returned by a POST to the Ajax interface
                post_param = {'pageNum': str(page_num), 'pageSize': '20', 'jsonStr': typeid}
                return_data = requests.post(url, headers=headers, data=post_param, verify=False)
                content = return_data.text
                if label == 'news.cqcoal.com':
                    one_page = get_cqcoal_page_news(content, type)
                page_num += 1
            print('News titles crawled')
        except:
            error = str(url + " label:" + label + " gd:" + str(typeid) + " pagenum:" + str(pagenum) + " type:" + type + ' not crawled')
            error_text_title(error, time_today)
            print(error)
    def get_cqcoal_page_news(content, type):
        # the response body is a JSON-like string; split it into one chunk per article
        l = content.split("},{")
        for i in range(len(l)):
            url = "http://news.cqcoal.com/blank/nc.jsp?mid=" + l[i][l[i].find("id") + 4:l[i].find("typeid") - 2]
            title = l[i][l[i].find("title") + 8:l[i].find("shorttitle") - 3]
            typename = l[i][l[i].find("typename") + 11:l[i].find("typeid2") - 3]
            timeStamp = l[i][(l[i].find("pubdate") + 10):(l[i].find("senddate") - 3)]
            description = l[i][l[i].find("description") + 14:l[i].find("filename") - 3]
            # pubdate is a Unix timestamp; convert it to YYYY-MM-DD
            timeStamp = int(timeStamp)
            dateArray = datetime.datetime.utcfromtimestamp(timeStamp)
            pubdate = dateArray.strftime("%Y-%m-%d")
            one_page = {'title': title, 'url': url, 'date': pubdate,
                        'type': type, 'label': 'news.cqcoal.com'}
            db.news_tmp.insert_one(one_page)
        return one_page
    ### news.cqcoal.com
    def start_Cqcoal_supply_and_demand():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"238"}', num, 'supply_and_demand')
    def start_Cqcoal_price():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"234"}', num, 'price')
    def start_Cqcoal_dynamic():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"235"}', num, 'dynamic')
    def start_Cqcoal_international():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"236"}', num, 'international')
    def start_Cqcoal_comment():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"14"}', num, 'comment')
    def start_Cqcoal_transportation():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"239"}', num, 'transportation')
    def start_Cqcoal_economics():
        start_type(cqcoal, 'news.cqcoal.com', 'road_price}', num, 'economics')
    def start_Cqcoal_policy():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"230"}', num, 'policy')
    def start_Cqcoal_correlation():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"237"}', num, 'correlation')
    def start_Cqcoal_expert():
        start_type(cqcoal, 'news.cqcoal.com', '{"typeid":"232"}', num, 'expert')

    start_Cqcoal_transportation()
    start_Cqcoal_supply_and_demand()
    start_Cqcoal_price()
    start_Cqcoal_policy()
    start_Cqcoal_international()
    start_Cqcoal_expert()
    start_Cqcoal_economics()
    start_Cqcoal_dynamic()
    start_Cqcoal_correlation()
    start_Cqcoal_comment()
def cqcoal_body_start():
    def get_new_body(label, type):
        link_list = get_title_links_from_MongoDB(label, type)
        if label == 'news.cqcoal.com':
            try:
                for url in link_list['url']:
                    news_body, news_body_1 = get_news_Cqcoal_text(url, label, type)
                    if news_body is not None:
                        db.news_tmp.update({'url': url}, {"$set": {'newsbody': news_body}})
                        db.news_tmp.update({'url': url}, {"$set": {'newsbody_1': news_body_1}})
                print("Site: " + label + " type: " + type + " body crawling finished!")
            except:
                error = str(url + " error:" + ' label:' + label + " type:" + type)
                # error_text(error)
                print(error)
    def get_news_Cqcoal_text(url, label, type):
        # html = urllib.request.urlopen(url, timeout=5)
        id = url.split('=')[1]
        url = 'http://news.cqcoal.com/manage/newsaction.do?method:getNewsAddonarticle'
        post_param = {'id': id}
        # return_data = requests.post(url, data=post_param, verify=False)
        try:
            return_data = requests.post(url, data=post_param, verify=False, timeout=120)
        except:
            print("error label:", url, " Time out!")
            error = str(url + " error:" + ' label:' + label + " type:" + type)
            error_text_body(error, time_today)
            return None, None
        return_data = return_data.text
        try:
            newsBody = return_data[return_data.find("body") + 7:return_data.find("xh") - 3]
            newsBody_1 = return_data[return_data.find("body") + 7:return_data.find("xh") - 3]
            print(url + " record crawled")
            return newsBody, newsBody_1
        except:
            print("error label:", url, " type:", type)
            error = str(url + " error:" + ' label:' + label + " type:" + type)
            error_text_body(error, time_today)
            return None, None
    ### news.cqcoal.com
    def start_body_Cqcoal_transportation():
        get_new_body('news.cqcoal.com', 'transportation')
    def start_body_Cqcoal_supply_and_demand():
        get_new_body('news.cqcoal.com', 'supply_and_demand')
    def start_body_Cqcoal_price():
        get_new_body('news.cqcoal.com', 'price')
    def start_body_Cqcoal_policy():
        get_new_body('news.cqcoal.com', 'policy')
    def start_body_Cqcoal_international():
        get_new_body('news.cqcoal.com', 'international')
    def start_body_Cqcoal_expert():
        get_new_body('news.cqcoal.com', 'expert')
    def start_body_Cqcoal_dynamic():
        get_new_body('news.cqcoal.com', 'dynamic')
    def start_body_Cqcoal_economics():
        get_new_body('news.cqcoal.com', 'economics')
    def start_body_Cqcoal_correlation():
        get_new_body('news.cqcoal.com', 'correlation')
    def start_body_Cqcoal_comment():
        get_new_body('news.cqcoal.com', 'comment')

    start_body_Cqcoal_transportation()
    start_body_Cqcoal_supply_and_demand()
    start_body_Cqcoal_price()
    start_body_Cqcoal_policy()
    start_body_Cqcoal_international()
    start_body_Cqcoal_expert()
    start_body_Cqcoal_economics()
    start_body_Cqcoal_dynamic()
    start_body_Cqcoal_correlation()
    start_body_Cqcoal_comment()
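The listing above only defines the two entry points; the code that actually drives them is not shown. Presumably (a sketch under that assumption, reusing the interactive helper from all_config_file) the script is kicked off roughly like this:

# hypothetical driver code; the original listing does not include this part
num = all_config_file.news_page_num()   # ask how many pages to crawl per site
cqcoal_title_start(num)                  # first pass: collect titles and links
cqcoal_body_start()                      # second pass: fetch each article body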
Browser rendering engine: the rendering engine's job is to render, that is, to display the requested content in the browser window. The browser sends a request to the server, receives the returned resource files, and the rendering engine then processes them and displays them in the window.
Two rendering engines are in widespread use today: WebKit (used by Safari, and by Chrome through its Blink fork) and Gecko (used by Firefox).
Main rendering flow: the rendering engine first obtains the content of the requested document over the network, usually in 8 KB chunks. Once the content arrives, the basic flow is: parse the HTML to build the DOM tree, construct the render tree, lay out the render tree, and finally paint it to the screen.
To scrape a dynamically rendered page there are two options: either analyze the Ajax/XHR requests in the developer tools and call the data interfaces directly (the approach used in this article), or drive a real browser engine with an automation tool such as Selenium and scrape the rendered HTML (see the sketch below).
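The second option is not used in this article, but for completeness here is a minimal sketch (assuming Chrome and a matching chromedriver are installed; the URL is a placeholder):

from selenium import webdriver

# launch a real browser engine, let it execute the page's JavaScript,
# then read back the fully rendered HTML
driver = webdriver.Chrome()
driver.get("https://example.com/dynamic-page")
html = driver.page_source   # HTML after the Ajax content has been rendered
driver.quit()
print(html[:200])

In practice an explicit wait for the target element is usually needed before reading page_source. The rest of this article takes the first route: the 36kr front page loads its feed through an Ajax interface, and the script below calls that interface directly.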
import warnings
warnings.filterwarnings("ignore")
import time
import requests
import pymongo
import pandas as pd
import re
from lxml import etree

# get the MongoClient object
client = pymongo.MongoClient("localhost", 27017)
# get the database object to use
db = client.news
today = time.strftime('%Y.%m.%d', time.localtime(time.time()))
def main(page_num):
    # number of pages to crawl for each channel
    n = int(page_num)

    def start_crawler(pro, col, adress):
        i = 1
        while i <= n:
            t = time.time()
            # the channel feed comes from 36kr's Ajax search-column interface
            url = "https://36kr.com/api/search-column/{}?per_page=20&page={}&_={}".format(pro, i, int(t))
            i += 1
            time.sleep(2)
            user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
            headers = {'User-Agent': user_agent}
            return_data = requests.get(url, headers=headers, verify=False)
            one_page = get_news_link_to_mongodb(return_data, col, url, adress)
        print(adress + " " + col + ' news titles crawled')

    def get_news_link_to_mongodb(return_data, col, url, adress):
        # each item in the JSON response describes one article
        for i in return_data.json().get('data').get('items'):
            one_page = {'title': i["title"],
                        'url_html': "https://36kr.com/p/" + str(i["id"]) + ".html",
                        'url_json': "https://36kr.com/p/" + str(i["id"]) + ".json",
                        'summary': i["summary"],
                        'tags': re.sub(r'[0-9"\[\]]', '', i["extraction_tags"]).replace(",,", " ").replace(",", ""),
                        'label': col,
                        'adress': adress,
                        'work_date': time.strftime('%Y.%m.%d', time.localtime(time.time()))}
            db.kr36.insert_one(one_page)
        print("Finished crawling " + adress + " " + col + " channel page: " + url)
    def news_body_start(label, adress, today):
        # collect the json URLs of today's articles for this channel
        url_list = []
        for item in db.kr36.find({'label': label, 'adress': adress, 'work_date': today},
                                 {'url_json': 1}):
            url_list.append(item)
        url_list = pd.DataFrame(url_list, columns=['url_json'])
        for i in url_list["url_json"]:
            html = requests.get(i)
            # the article body sits in the JSON field props -> detailArticle|post -> content
            a = html.json().get('props').get('detailArticle|post').get('content')
            sel = etree.HTML(a)
            clear_content = sel.xpath('string(//*)')   # strip the HTML tags, keep the text
            db.kr36.update({'url_json': i}, {"$set": {'newsbody': clear_content}})
            print(i + " crawled")
        print(adress + " " + label + " news bodies crawled")
    start_crawler('23', '大公司', "36kr")
    start_crawler('221', '消费', "36kr")
    start_crawler('225', '娱乐', "36kr")
    start_crawler('218', '前沿技术', "36kr")
    start_crawler('219', '汽车交通', "36kr")
    start_crawler('208', '区块链', "36kr")
    start_crawler('103', '技能get', "36kr")

    news_body_start("大公司", "36kr", today)
    news_body_start("消费", "36kr", today)
    news_body_start("娱乐", "36kr", today)
    news_body_start("前沿技术", "36kr", today)
    news_body_start("汽车交通", "36kr", today)
    news_body_start("区块链", "36kr", today)
    news_body_start("技能get", "36kr", today)

    # export module: dump everything crawled from 36kr to CSV
    name = {'adress': '36kr'}
    search_res = db.kr36.find(name)
    list_ = []
    for i in search_res:
        list_.append(i)
    ddf = pd.DataFrame(list_, columns=["title", "url_html", "tags", "label", "adress", "summary", "newsbody"])
    ddf.to_csv("36氪首页news.csv", encoding="utf_8_sig")
if __name__ == '__main__':
    page_num = input("Number of pages to crawl: ")
    main(page_num)