原标题:Python爬虫案例:分析Ajax请求并抓取今日头条街拍图片
1.抓取索引页内容
利用requests请求目标站点,得到索引网页HTML代码,返回结果。
# Imports for the index-page demo. The scraped article had fused keywords
# ("fromurllib.parse importurlencode"), which is a SyntaxError.
from urllib.parse import urlencode

import requests
from requests.exceptions import RequestException
def get_page_index(offset, keyword):
    """Fetch one page of the Toutiao search-index JSON for `keyword`.

    Args:
        offset: paging offset passed to the Ajax endpoint.
        keyword: search term (e.g. '街拍').

    Returns:
        The response body text on HTTP 200, otherwise None.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    data = {
        'format': 'json',
        'offset': offset,
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # The request itself is what raises RequestException, so it must
        # live INSIDE the try block; the original issued it before `try`,
        # letting connection errors escape unhandled.
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求索引页失败')
        return None
def main():
    """Demo entry point: fetch the first index page and print the raw JSON."""
    html = get_page_index(0, '街拍')
    print(html)


if __name__ == '__main__':
    # The original wrote `defmain:` (no parentheses — SyntaxError) and
    # `main` without a call, which never executes the function.
    main()
2.抓取详情页内容
解析返回结果,得到详情页的链接,并进一步抓取详情页的信息。
# Extract detail-page URLs from the index-page JSON.
def parse_page_index(html):
    """Yield each `article_url` found in the index-page JSON string `html`."""
    data = json.loads(html)
    # The original tested `'data' in data.keys` (method object, not called),
    # which raises TypeError; membership on the dict itself is the fix.
    if data and 'data' in data:
        for item in data.get('data'):
            yield item.get('article_url')
单个页面代码:
def get_page_detail(url):
    """Fetch the HTML of one detail page.

    Returns the body text on HTTP 200, otherwise None (including on
    request failure).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Original message had a duplicated character ("详情页页").
        print('请求详情页失败')
        return None
# Parse a detail page: page title plus the gallery image URLs.
def parse_page_detail(html, url):
    """Parse a detail page's HTML for the gallery data.

    Side effect: downloads every gallery image via download_image().

    Returns:
        dict with 'title', 'images', 'url' when a gallery is found,
        otherwise None (implicitly).
    """
    soup = BeautifulSoup(html, 'lxml')
    # `.get_text` must be CALLED; the original stored the bound method.
    title = soup.select('title')[0].get_text()
    # Escape regex metacharacters: in the original pattern the bare `.`
    # and `(...)` were treated as regex syntax, so group(1) captured the
    # wrong span.
    images_pattern = re.compile(r'gallery: JSON\.parse\((.*?)\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        # group(1) is a JSON-encoded *string* that itself contains JSON,
        # hence the deliberate double decode: str first, then dict.
        data = json.loads(result.group(1))
        data = json.loads(data)
        if data and 'sub_images' in data:
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'images': images,
                'url': url,
            }
3.下载图片与保存数据库
将图片下载到本地,并把页面信息及图片URL保存到MongoDB。
# Persist one parsed result to MongoDB.
def save_to_mongo(result):
    """Insert `result` into the configured collection; True on success."""
    # `Collection.insert` is deprecated and was removed in pymongo 4;
    # `insert_one` is the supported API and its result is truthy.
    if db[MONGO_TABLE].insert_one(result):
        print('存储到MongoDb成功', result)
        return True
    return False
# Download one image and hand the raw bytes to save_image().
def download_image(url):
    """Download the image at `url` and save it to disk; always returns None."""
    print('正在下载', url)
    # The original UA string contained a stray space ("537. 36"),
    # presumably an extraction artifact.
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except RequestException:
        print('请求图片失败', url)
        return None
def save_image(content):
    """Write image bytes to <cwd>/<md5-of-content>.jpg, skipping duplicates."""
    # `os.getcwd` and `hexdigest` must be CALLED; without the parentheses
    # the path embedded method reprs instead of the cwd and the digest.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
4.开启循环及多线程
对多页内容遍历,开启多线程提高抓取速度。
# Fan the page offsets out across a process pool
# (offsets GROUP_START*20 .. GROUP_END*20).
groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
# `Pool` must be instantiated; the original assigned the bare class,
# so `pool.map` would fail when called.
pool = Pool()
pool.map(main, groups)
# Imports for the complete crawler. Grouped stdlib / third-party / local;
# the scraped article had fused keywords ("fromurllib.parse importurlencode").
from hashlib import md5
from multiprocessing import Pool
from urllib.parse import urlencode
import json
import os
import re

from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import pymongo
import requests

from config import *
# Module-level MongoDB connection shared by save_to_mongo();
# MONGO_URL / MONGO_DB come from config.py via `from config import *`.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
def get_page_index(offset, keyword):
    """Fetch one page of the Toutiao search-index JSON for `keyword`.

    Args:
        offset: paging offset passed to the Ajax endpoint.
        keyword: search term.

    Returns:
        The response body text on HTTP 200, otherwise None.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    data = {'format': 'json', 'offset': offset, 'keyword': keyword, 'autoload': 'true', 'count': 20, 'cur_tab': 1, 'from': 'search_tab', 'pd': 'synthesis'}
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求索引页失败')
        return None
def parse_page_index(html):
    """Yield each `article_url` found in the index-page JSON string `html`."""
    data = json.loads(html)
    # `'data' in data.keys` (method not called) raises TypeError;
    # test membership on the dict directly.
    if data and 'data' in data:
        for item in data.get('data'):
            yield item.get('article_url')
def get_page_detail(url):
    """Fetch the HTML of one detail page.

    Returns the body text on HTTP 200, otherwise None (including on
    request failure).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Original message had a duplicated character ("详情页页").
        print('请求详情页失败')
        return None
def parse_page_detail(html, url):
    """Parse a detail page's HTML for the gallery data.

    Side effect: downloads every gallery image via download_image().

    Returns:
        dict with 'title', 'images', 'url' when a gallery is found,
        otherwise None (implicitly).
    """
    soup = BeautifulSoup(html, 'lxml')
    # `.get_text` must be CALLED; the original stored the bound method.
    title = soup.select('title')[0].get_text()
    # Escape regex metacharacters: unescaped `.` and `(...)` made the
    # original capture the wrong span.
    images_pattern = re.compile(r'gallery: JSON\.parse\((.*?)\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        # group(1) is a JSON-encoded *string* that itself contains JSON,
        # hence the deliberate double decode: str first, then dict.
        data = json.loads(result.group(1))
        data = json.loads(data)
        if data and 'sub_images' in data:
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'images': images,
                'url': url,
            }
def save_to_mongo(result):
    """Insert `result` into the configured collection; True on success."""
    # `Collection.insert` is deprecated and was removed in pymongo 4;
    # `insert_one` is the supported API and its result is truthy.
    if db[MONGO_TABLE].insert_one(result):
        print('存储到MongoDb成功', result)
        return True
    return False
def download_image(url):
    """Download the image at `url` and save it to disk; always returns None."""
    print('正在下载', url)
    # The original UA string contained a stray space ("537. 36").
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except RequestException:
        print('请求图片失败', url)
        return None
def save_image(content):
    """Write image bytes to <cwd>/<md5-of-content>.jpg, skipping duplicates."""
    # `os.getcwd` and `hexdigest` must be CALLED; without the parentheses
    # the path embedded method reprs instead of the cwd and the digest.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)
def main(offset):
    """Crawl one index page at `offset`: parse it, fetch each detail page,
    and store every parsed gallery result in MongoDB."""
    html = get_page_index(offset, KEYWORD)
    # get_page_index returns None on failure; the original then crashed
    # inside parse_page_index on json.loads(None).
    if not html:
        return
    for url in parse_page_index(html):
        html = get_page_detail(url)
        if html:
            result = parse_page_detail(html, url)
            if isinstance(result, dict):
                save_to_mongo(result)
if __name__ == '__main__':
    # One offset per page, fanned out across a process pool.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # `Pool` must be instantiated; the original assigned the bare class.
    pool = Pool()
    pool.map(main, groups)
config.py
# Crawler configuration (config.py), pulled in via `from config import *`.
MONGO_URL = 'localhost'  # MongoDB host
MONGO_DB = 'toutiao'  # database name
MONGO_TABLE = 'jiepai'  # collection name
GROUP_START = 1  # first page-offset multiplier (offset = x * 20)
GROUP_END = 20  # last page-offset multiplier, inclusive
KEYWORD = '街拍'  # search keyword ("street snap")
责任编辑: