一、背景
近期在学习爬虫,利用Requests模块获取页面,BeautifulSoup来获取需要的内容,最后利用xlsxwriter模块将内容保存至excel,在此记录一下,后续可举一反三,利用其抓取其他内容并持久化存储到文件或数据库等。
二、代码
编写了两个模块,geturl3和getexcel3,最后在main内调用
git源码地址
geturl3.py
代码内容如下:
#!/bin/env python
# -*- coding:utf-8 -*-
# @Author : kaliarch
import requests
from bs4 import BeautifulSoup
class get_urldic:
    """Scrape 51CTO blog search results.

    Builds the search-result page URLs from user input, downloads each
    page, and extracts blog titles and their links.
    """

    def get_url(self):
        """Prompt for a search keyword and page count, build result URLs.

        Returns:
            tuple: (list of search-result page URLs, the search keyword).

        Raises:
            SystemExit: if the page count is not a valid integer.
        """
        urlList = []
        first_url = 'https://blog.51cto.com/search/result?q='
        after_url = '&type=&page='
        try:
            search = input("Please input search name:")
            page = int(input("Please input page:"))
        except Exception as e:
            # int() fails on non-numeric input; bail out cleanly.
            print('Input error:', e)
            # raise SystemExit instead of exit(): exit() is a site-module
            # convenience and is not guaranteed to exist in all runtimes.
            raise SystemExit(1)
        for num in range(1, page + 1):
            urlList.append(first_url + search + after_url + str(num))
        print("Please wait....")
        return urlList, search

    def get_html(self, urlList):
        """Download every URL and return the raw response bodies.

        Args:
            urlList: iterable of URLs to fetch.

        Returns:
            list of bytes: one raw HTML body per URL, in input order.
        """
        response_list = []
        for url in urlList:
            # A timeout keeps one stalled request from hanging the
            # whole scrape forever (requests has no default timeout).
            response = requests.get(url, timeout=30)
            response_list.append(response.content)
        return response_list

    def get_soup(self, html_doc):
        """Parse each HTML page and collect blog title -> URL pairs.

        Args:
            html_doc: iterable of HTML documents (bytes or str).

        Returns:
            dict: blog title (stripped) mapped to its href. Duplicate
            titles across pages overwrite earlier entries.
        """
        result = {}
        for page_html in html_doc:
            soup = BeautifulSoup(page_html, 'html.parser')
            # Result links carry class 'm-1-4 fl' on 51CTO search pages
            # — TODO confirm the site markup has not changed.
            for anchor in soup.find_all('a', class_='m-1-4 fl'):
                result[anchor.get_text().strip()] = anchor['href']
        return result
if __name__ == '__main__':
    # Manual smoke test: run an interactive search and dump every hit.
    scraper = get_urldic()
    urls, keyword = scraper.get_url()
    pages = scraper.get_html(urls)
    found = scraper.get_soup(pages)
    for name, link in found.items():
        print('search blog_name is:%s,blog_url is:%s' % (name, link))
getexcel3.py
代码内容如下:
#!/bin/env python
# -*- coding:utf-8 -*-
# @Author : kaliarch
import xlsxwriter
class create_excle:
    """Write scraped blog search results into a formatted .xlsx workbook."""

    def __init__(self):
        # Column headers for the two data columns.
        self.tag_list = ["blog_name", "blog_url"]

    def create_workbook(self, search=" "):
        """Create '<search>.xlsx' with one worksheet named after the keyword.

        Args:
            search: search keyword; used as both file and sheet name.

        Returns:
            tuple: (workbook, worksheet).
        """
        excle_name = search + '.xlsx'
        workbook = xlsxwriter.Workbook(excle_name)
        worksheet_M = workbook.add_worksheet(search)
        print('create %s....' % excle_name)
        return workbook, worksheet_M

    def col_row(self, worksheet):
        """Size the title row and widen both data columns."""
        worksheet.set_row(0, 17)
        # Original code set 'A:A' to 12 and then immediately to 58;
        # only the final width matters, so set it once.
        worksheet.set_column('A:A', 58)
        worksheet.set_column('B:B', 58)

    def shell_format(self, workbook):
        """Build the three cell formats used by the sheet.

        Returns:
            tuple: (merge/title format, header format, body format).
        """
        # Merged title cell: bold, bordered, centered, antique-white fill.
        merge_format = workbook.add_format({
            'bold': 1,
            'border': 1,
            'align': 'center',
            'valign': 'vcenter',
            'fg_color': '#FAEBD7',
        })
        # Column-header row: bold, bordered, centered, light-cyan fill.
        name_format = workbook.add_format({
            'bold': 1,
            'border': 1,
            'align': 'center',
            'valign': 'vcenter',
            'fg_color': '#E0FFFF',
        })
        # Body cells: centered only.
        normal_format = workbook.add_format({
            'align': 'center',
        })
        return merge_format, name_format, normal_format

    def write_title(self, worksheet, search, merge_format):
        """Write the merged A1:B1 title '<search>搜索结果'."""
        title = search + "搜索结果"
        worksheet.merge_range('A1:B1', title, merge_format)
        print('write title success')

    def write_tag(self, worksheet, name_format):
        """Write the column headers on row 1 (second sheet row)."""
        tag_row = 1
        for tag_col, tag in enumerate(self.tag_list):
            worksheet.write(tag_row, tag_col, tag, name_format)
        print('write tag success')

    def write_context(self, worksheet, con_dic, normal_format):
        """Write one title/url pair per row, starting at row 2.

        Args:
            worksheet: target worksheet.
            con_dic: dict of blog title -> blog URL.
            normal_format: cell format for body rows.
        """
        # Bug fix: the original loop broke out when row > len(con_dic);
        # since row starts at 2, that silently dropped the last entry
        # (and wrote nothing at all for a single-entry dict).
        row = 2
        for title, url in con_dic.items():
            worksheet.write(row, 0, title, normal_format)
            worksheet.write(row, 1, url, normal_format)
            row += 1
        print('write context success')

    def workbook_close(self, workbook):
        """Flush and close the workbook (required to produce a valid file)."""
        workbook.close()
if __name__ == '__main__':
    # This module is meant to be imported; running it directly only
    # prints an informational message.
    print('This is create excel mode')
main.py
代码内容如下:
#!/bin/env python
# -*- coding:utf-8 -*-
# @Author : kaliarch
import geturl3
import getexcel3
#获取url字典
def get_dic():
    """Run the scraper end to end.

    Returns:
        tuple: (dict of blog title -> blog URL, the search keyword).
    """
    scraper = geturl3.get_urldic()
    url_list, keyword = scraper.get_url()
    pages = scraper.get_html(url_list)
    return scraper.get_soup(pages), keyword
#写入excle
def write_excle(urldic, search):
    """Persist the scraped results into '<search>.xlsx'.

    Args:
        urldic: dict of blog title -> blog URL.
        search: search keyword; names the workbook and worksheet.
    """
    writer = getexcel3.create_excle()
    workbook, sheet = writer.create_workbook(search)
    writer.col_row(sheet)
    merge_fmt, header_fmt, body_fmt = writer.shell_format(workbook)
    writer.write_title(sheet, search, merge_fmt)
    writer.write_tag(sheet, header_fmt)
    writer.write_context(sheet, urldic, body_fmt)
    writer.workbook_close(workbook)
def main():
    """Scrape search results, then write them to an Excel workbook."""
    results, keyword = get_dic()
    write_excle(results, keyword)


if __name__ == '__main__':
    main()
三、效果展示
运行代码,填写搜索的关键字,及搜索多少页
查看会生成一个以搜索关键字命名的excel,打开写入的内容
利用其就可以搜索并保存自己需要的51CTO推荐博客,可以多搜索几个