python爬虫爬取今日头条_Python爬虫案例:分析Ajax请求并抓取今日头条街拍图片...

原标题:Python爬虫案例:分析Ajax请求并抓取今日头条街拍图片

1.抓取索引页内容

利用requests请求目标站点,得到索引网页HTML代码,返回结果。

# Imports needed by the index-page snippet below.
from urllib.parse import urlencode
from requests.exceptions import RequestException
import requests

'''

遇到不懂的问题?Python学习交流群:821460695满足你的需求,资料都已经上传群文件,可以自行下载!

'''

def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results and return the raw body text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or the request raised ``RequestException``).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    data = {
        'format': 'json',
        'offset': offset,
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 1,
        'from': 'search_tab',
        'pd': 'synthesis',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # The request itself must be inside the try block; otherwise a
        # connection error escapes the RequestException handler below.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求索引页失败')
        return None

def main():
    """Fetch the index page at offset 0 for the keyword '街拍' and print it."""
    html = get_page_index(0, '街拍')
    print(html)

# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()

2.抓取详情页内容

解析返回结果,得到详情页的链接,并进一步抓取详情页的信息。

# Get the page URLs:
def parse_page_index(html):
    """Yield the article URLs found in an index-page JSON response.

    Args:
        html: JSON text of the index page, or ``None`` when the fetch failed.

    Yields:
        The ``article_url`` value of every entry in the top-level ``data``
        list. Entries without a usable ``article_url`` are skipped. Nothing
        is yielded when ``html`` is empty, is not valid JSON, or does not
        contain a ``data`` list.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError.
        return
    if not isinstance(data, dict):
        return
    items = data.get('data')
    if not isinstance(items, list):
        return
    for item in items:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url

单个页面代码:

def get_page_detail(url, timeout=10):
    """Fetch a detail page and return its body text.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or the request raised ``RequestException``).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求详情页页失败')
        return None

# Image URLs:
def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Every image URL that is found is also passed to ``download_image``.

    Args:
        html: Detail-page HTML text.
        url: Address the page was fetched from; echoed back in the result.

    Returns:
        A dict with keys ``'title'``, ``'images'`` and ``'url'`` when the page
        contains gallery data with a ``sub_images`` entry, otherwise ``None``.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # get_text must be called; guard against pages without a <title> tag.
    title = title_tags[0].get_text() if title_tags else ''
    # The parentheses of JSON.parse(...) have to be escaped, otherwise they
    # form an empty capture group. The argument is captured as a complete
    # JSON string literal because the double decode below relies on that.
    # NOTE(review): assumes the page embeds `gallery: JSON.parse("...")`;
    # confirm against a live page.
    images_pattern = re.compile(r'gallery: JSON\.parse\(("(?:[^"\\]|\\.)*")\)', re.S)
    result = re.search(images_pattern, html)
    if not result:
        return None
    try:
        # First decode yields the inner JSON text, second decode the dict.
        data = json.loads(json.loads(result.group(1)))
    except (TypeError, ValueError):
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None
    sub_images = data.get('sub_images') or []
    images = [item.get('url') for item in sub_images]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'images': images,
        'url': url,
    }

3.下载图片与保存数据库

将图片下载到本地,并把页面信息及图片URL保存到MongoDB。

# Save to the database
def save_to_mongo(result):
    """Insert one parsed-page record into the ``MONGO_TABLE`` collection.

    Args:
        result: Dict describing the page, as produced by
            ``parse_page_detail``.

    Returns:
        ``True`` when the document was inserted, ``False`` otherwise.
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one() is its replacement for a single document.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print('存储到MongoDb成功', result)
        return True
    return False

# Download an image
def download_image(url, timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``; the image is saved only on an HTTP 200 response.
    """
    print('正在下载', url)
    # User-Agent matches the one sent by the page requests (the stray space
    # in "537. 36" is removed).
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except RequestException:
        print('请求图片失败', url)
        return None

def save_image(content):
    """Write image bytes to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    Naming the file after the MD5 digest of its content means identical
    images map to the same path and are written only once.

    Args:
        content: Raw image bytes.
    """
    # getcwd and hexdigest must be called; formatting the bare functions
    # would embed their repr in the path.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)

4.开启循环及多进程

对多页内容遍历,开启多进程(multiprocessing.Pool)提高抓取速度。

# One offset per page group: GROUP_START*20, ..., GROUP_END*20.
groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
# Pool must be instantiated; map() blocks until every offset is processed.
pool = Pool()
pool.map(main, groups)
pool.close()
pool.join()

# Complete code:
from urllib.parse import urlencode
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from hashlib import md5
from multiprocessing import Pool
# config supplies MONGO_URL, MONGO_DB, MONGO_TABLE, GROUP_START, GROUP_END, KEYWORD.
from config import *
import pymongo
import requests
import json
import re
import os

'''

遇到不懂的问题?Python学习交流群:821460695满足你的需求,资料都已经上传群文件,可以自行下载!

'''

# connect=False defers connecting until the first operation. This module is
# also loaded by the worker processes created by multiprocessing.Pool, and a
# MongoClient connected before a fork is not safe to use in the children.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao search results and return the raw body text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or the request raised ``RequestException``).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    data = {'format': 'json', 'offset': offset, 'keyword': keyword, 'autoload': 'true', 'count': 20, 'cur_tab': 1, 'from': 'search_tab', 'pd': 'synthesis'}
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # Without a timeout a stalled connection would block a pool worker
        # forever; a timeout surfaces as a RequestException handled below.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求索引页失败')
        return None

def parse_page_index(html):
    """Yield the article URLs found in an index-page JSON response.

    Args:
        html: JSON text of the index page, or ``None`` when the fetch failed.

    Yields:
        The ``article_url`` value of every entry in the top-level ``data``
        list. Entries without a usable ``article_url`` are skipped. Nothing
        is yielded when ``html`` is empty, is not valid JSON, or does not
        contain a ``data`` list.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:
        # json.JSONDecodeError subclasses ValueError.
        return
    if not isinstance(data, dict):
        return
    items = data.get('data')
    if not isinstance(items, list):
        return
    for item in items:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url

def get_page_detail(url, timeout=10):
    """Fetch a detail page and return its body text.

    Args:
        url: Address of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or the request raised ``RequestException``).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    try:
        # Without a timeout a stalled connection would block a pool worker
        # forever; a timeout surfaces as a RequestException handled below.
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('请求详情页页失败')
        return None

def parse_page_detail(html, url):
    """Extract the title and gallery image URLs from a detail page.

    Every image URL that is found is also passed to ``download_image``.

    Args:
        html: Detail-page HTML text.
        url: Address the page was fetched from; echoed back in the result.

    Returns:
        A dict with keys ``'title'``, ``'images'`` and ``'url'`` when the page
        contains gallery data with a ``sub_images`` entry, otherwise ``None``.
    """
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    # get_text must be called; guard against pages without a <title> tag.
    title = title_tags[0].get_text() if title_tags else ''
    # The parentheses of JSON.parse(...) have to be escaped, otherwise they
    # form an empty capture group. The argument is captured as a complete
    # JSON string literal because the double decode below relies on that.
    # NOTE(review): assumes the page embeds `gallery: JSON.parse("...")`;
    # confirm against a live page.
    images_pattern = re.compile(r'gallery: JSON\.parse\(("(?:[^"\\]|\\.)*")\)', re.S)
    result = re.search(images_pattern, html)
    if not result:
        return None
    try:
        # First decode yields the inner JSON text, second decode the dict.
        data = json.loads(json.loads(result.group(1)))
    except (TypeError, ValueError):
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None
    sub_images = data.get('sub_images') or []
    images = [item.get('url') for item in sub_images]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'images': images,
        'url': url,
    }

def save_to_mongo(result):
    """Insert one parsed-page record into the ``MONGO_TABLE`` collection.

    Args:
        result: Dict describing the page, as produced by
            ``parse_page_detail``.

    Returns:
        ``True`` when the document was inserted, ``False`` otherwise.
    """
    # Collection.insert() was deprecated in PyMongo 3 and removed in
    # PyMongo 4; insert_one() is its replacement for a single document.
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.inserted_id is not None:
        print('存储到MongoDb成功', result)
        return True
    return False

def download_image(url, timeout=10):
    """Download one image and hand its bytes to ``save_image``.

    Args:
        url: Address of the image.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``None``; the image is saved only on an HTTP 200 response.
    """
    print('正在下载', url)
    # User-Agent matches the one sent by get_page_index / get_page_detail
    # (the stray space in "537. 36" is removed).
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except RequestException:
        print('请求图片失败', url)
        return None

def save_image(content):
    """Write image bytes to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    Naming the file after the MD5 digest of its content means identical
    images map to the same path and are written only once.

    Args:
        content: Raw image bytes.
    """
    # getcwd and hexdigest must be called; formatting the bare functions
    # would embed their repr in the path.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)

def main(offset):
    """Crawl one index page and store every gallery found on it.

    Fetches the index page for ``KEYWORD`` at ``offset``, visits each article
    URL it lists, parses the detail page and saves dict results to MongoDB.

    Args:
        offset: Search-result offset passed to ``get_page_index``.
    """
    index_html = get_page_index(offset, KEYWORD)
    if not index_html:
        # The index request failed; there is nothing to parse.
        return
    for url in parse_page_index(index_html):
        # A separate name keeps the index page text from being overwritten.
        detail_html = get_page_detail(url)
        if detail_html:
            result = parse_page_detail(detail_html, url)
            if isinstance(result, dict):
                save_to_mongo(result)

if __name__ == '__main__':
    # One offset per page group: GROUP_START*20, ..., GROUP_END*20.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    # Pool must be instantiated; map() blocks until every offset is processed.
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()

config.py

# MongoDB host, database name and collection name used by the crawler.
MONGO_URL, MONGO_DB, MONGO_TABLE = "localhost", "toutiao", "jiepai"

# Inclusive bounds of the page groups; each group number is multiplied by 20
# to obtain the search offset.
GROUP_START, GROUP_END = 1, 20

# Search keyword sent with every index-page request.
KEYWORD = "街拍"

责任编辑:

你可能感兴趣的:(python爬虫爬取今日头条)