# 在学 Python,记录一些学习阶段的(丑陋的)爬虫成果。
#导入所需的库
import requests
from bs4 import BeautifulSoup
import time
from time import sleep
import pandas as pd
import re
import tqdm as tqdm
import os
from pandas import DataFrame
import csv
# Main settings shared by the crawler: the index-page URL and the HTTP
# request headers (a desktop-browser User-Agent plus content/connection hints).
url = "http://www.hprc.org.cn/wxzl/wxysl/lczf/"
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36",
    "content-type": "application/json; charset=UTF-8",
    "Connection": "keep-alive",
}
def content(headers, url):
    """Fetch the index page and return the table cells that hold article links.

    Args:
        headers: HTTP request headers to send with the request.
        url: Address of the index page to download.

    Returns:
        The result of ``soup.find_all`` for every ``<td>`` element whose CSS
        class is ``bl``.
    """
    # The headers argument used to be accepted but never sent; pass it on.
    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = 'utf8'  # decode the body as UTF-8
    soup = BeautifulSoup(response.text, 'html.parser')
    # Attribute filter must be a dict; the former set literal {'class', 'bl'}
    # only worked because BeautifulSoup treats a non-dict value as a list of
    # acceptable class names (so it would also have matched class="class").
    links = soup.find_all('td', {'class': 'bl'})
    return links
# Fetch the index page once, reusing the module-level request settings
# instead of repeating the header dict and URL literals.
links = content(headers, url)
print(len(links))  # number of link cells found; print(links) would show them all
# Collect the follow-up ("next page") URLs of an article so that every page
# of every article can be downloaded later.
def try_req(path):
    """Return the URLs of the existing continuation pages of an article.

    Continuation pages follow the pattern ``<name>_<n>.html`` where
    ``<name>.html`` is the article's first page. Pages are probed in order
    starting at ``_1`` and probing stops at the first one that does not
    answer with HTTP 200.

    Args:
        path: URL of the article's first page; expected to end in ``.html``.

    Returns:
        A list of continuation-page URLs (empty when the article has a
        single page).
    """
    linew = []
    base = path[:-5]  # drop the trailing ".html"
    # At most 14 continuation pages (_1 .. _14) are probed; raise the upper
    # bound if longer articles exist.
    for i in range(1, 15):
        newpath = base + '_' + str(i) + '.html'
        # Compare the status code itself. The previous test,
        # str(response) == '', could never be true (a Response prints as
        # "<Response [200]>"), so no continuation page was ever found.
        if requests.get(newpath, timeout=30).status_code != 200:
            break
        linew.append(newpath)
    return linew
class find_all_path():
    """Turn the scraped index-page cells into the full list of article URLs.

    The instance reads the module-level ``links`` (the cells returned by
    ``content``) and relies on the module-level ``try_req`` to discover
    continuation pages.
    """

    def __init__(self):
        # Cells scraped from the index page; each one embeds an href.
        self.path = links

    def find_path(self):
        """Return the absolute URL of the first page of every article.

        The relative link is cut out of each cell's HTML text and appended
        to the index-page URL.
        """
        url = "http://www.hprc.org.cn/wxzl/wxysl/lczf/"
        path_list = []
        for cell in self.path:
            html = str(cell)
            # 'href="' is 6 characters long; +8 skips two more characters
            # (presumably a leading "./" in the relative link - TODO confirm).
            num1 = html.find('href="') + 8
            num2 = html.find('.html')
            path_list.append(url + html[num1:num2] + '.html')
        return path_list

    def _extra_pages(self):
        """Return, per article, the list of its continuation-page URLs."""
        return [try_req(first_page) for first_page in self.find_path()]

    def all_path(self):
        """Return the ``str()`` of each article's continuation-page list.

        The stringified form is kept for backward compatibility; ``clean``
        no longer depends on it.
        """
        return [str(pages) for pages in self._extra_pages()]

    def clean(self):
        """Return every page URL: continuation pages first, then first pages.

        The lists are flattened directly. The earlier implementation joined
        and re-split their string representations, which produced a stray
        ``''`` entry when no article had continuation pages and would have
        corrupted any URL containing a comma, space, quote or bracket.
        """
        first_pages = self.find_path()
        list_all = []
        for first_page in first_pages:
            list_all.extend(try_req(first_page))
        return list_all + first_pages
if __name__ == '__main__':
    # Collect the URL of every report page. clean() crawls the site again on
    # each call (it probes continuation pages through try_req), so it is
    # called exactly once and its result is reused.
    # Tip: once the whole script is written, restart the interpreter/kernel
    # and run it from a clean state to verify it.
    a = find_all_path()
    paths = a.clean()
    print(paths)
# Download the content of a single article page
def new_content(headers, url):
    """Fetch one article page and return its title and body text.

    Args:
        headers: HTTP request headers to send with the request.
        url: Address of the article page.

    Returns:
        A ``(head, text)`` tuple: the text of the first
        ``<span class="huang16c">`` element and the text of the first
        ``<td class="hui14_30">`` element.

    Raises:
        AttributeError: If either element is missing from the page, because
            ``soup.find`` then returns ``None``.
    """
    # The headers argument used to be accepted but never sent; pass it on.
    # A timeout keeps an unresponsive server from hanging the script forever.
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = 'utf8'
    soup = BeautifulSoup(response.text, 'html.parser')
    # find() returns only the first match. The attribute filter must be a
    # dict; the former set literal {'class', 'huang16c'} relied on
    # BeautifulSoup treating a non-dict value as a list of class names.
    head = str(soup.find('span', {'class': 'huang16c'}).text)
    text = str(soup.find('td', {'class': 'hui14_30'}).text)
    article = (head, text)
    return article
# Clean up the scraped article text
def clean_text(article):
    """Return ``"<title>:<cleaned body>"`` for a ``(title, body)`` pair.

    The body first loses every ASCII letter and each of the characters
    ``* = _ : ; < > / % -``. This is deliberately aggressive and may remove
    more than strictly necessary; adjust the pattern if needed. A fixed
    series of literal substitutions is then applied in order.
    """
    title, body = article[0], article[1]
    cleaned = re.sub('([a-zA-Z*=_:;<>/%-])', '', body)
    # Order matters: each substitution runs on the result of the previous one.
    substitutions = (
        ('\n', '\t'),     # line breaks become tabs
        ('楷体', ''),
        ('年月日', ''),
        ('\u3000', ''),   # ideographic space
        ('"', ''),
        (' ', ''),
        ('\t\t', '\t'),   # single pass: halves runs of tabs
        ('\xa0', ''),     # non-breaking space
    )
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return title + ':' + cleaned
# Write every government work report to its own text file
def writing(paths, out_dir='/Users/yunjiefei/Desktop/new/'):
    """Download, clean and save each article in ``paths`` as a UTF-8 text file.

    Each file is named ``<prefix>_<k>.txt``. ``prefix`` is the first four
    characters of the article title (the year, for titles such as
    "2006年政府工作报告") and ``k`` is the article's position in ``paths``,
    appended only to keep file names unique. The file holds the URL on the
    first line, followed by ``title:cleaned text``.

    Args:
        paths: Iterable of article page URLs.
        out_dir: Directory that receives the files; created if missing.
            Defaults to the location the script has always used.
    """
    os.makedirs(out_dir, exist_ok=True)
    for k, i in enumerate(paths):
        time.sleep(1)  # pause between requests to go easy on the server
        text1 = clean_text(new_content(headers, i))
        text = str(i) + '\n' + text1
        # Take the prefix from the title itself (the part before the first
        # ':'). The old code looked for ':' in `text`, which begins with the
        # URL, so it only worked because "http:" puts the colon at index 4.
        text2 = text1.partition(':')[0][:4]
        with open(os.path.join(out_dir, text2 + '_' + str(k) + '.txt'), 'w', encoding='utf-8') as f:
            f.write(text)
        # print(k)  # uncomment to show progress
# Run the export: pass the collected URLs in `paths` to writing().
writing(paths)
# Load the saved report files into a DataFrame and export them to Excel.
report_dir = r'/Users/yunjiefei/Desktop/new'
# Sorted for a reproducible row order. Hidden entries (e.g. the .DS_Store
# file that macOS creates) are skipped.
file_name = sorted(name for name in os.listdir(report_dir) if not name.startswith('.'))
li = []
li_all = []
for i in file_name:
    # A context manager closes each file; the handles used to be leaked.
    with open(os.path.join(report_dir, i), 'r', encoding='utf8', errors='ignore') as files:
        filereader = files.read()
    # The marker is 8 characters long; +9 also drops the character after it
    # (presumably a leading tab - TODO confirm).
    # NOTE(review): if the marker is absent, find() returns -1 and the slice
    # silently starts at index 8.
    filecontent = filereader[filereader.find('年政府工作报告:') + 9:]
    filelink = filereader[:filereader.find('.html') + 5]  # URL on the first line
    filetitle = str(i)[:4] + '年政府工作报告'
    li = [i, filecontent, filelink, filetitle]
    li_all.append(li)  # one row per file
print(len(li_all))  # number of reports loaded

# Build one DataFrame from all rows and save it. The earlier version merged
# manually built batches (data2..data4, never defined here) with
# DataFrame.append, which raised NameError and no longer exists in pandas 2.x.
data = pd.DataFrame(li_all, columns=["filename", "content", 'link', 'title'])
data_all = data.set_index('filename')
data_all.to_excel('/Users/yunjiefei/Desktop/reports.xlsx')  # final workbook with every report
# The table can be tidied up further in Excel as needed.