task1:
import requests
import csv
from bs4 import BeautifulSoup
import re
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edg/15.15063'
headers = {'User-Agent': user_agent}
policies = requests.get("#国家政策信息网网址", headers=headers)  # placeholder URL: national policy information site; headers passed so the User-Agent is actually used
policies.encoding = policies.apparent_encoding
p = BeautifulSoup(policies.text,'html.parser')
contents = p.find_all(href=re.compile('content'))  # anchors whose href contains "content" point to policy detail pages
rows = []
for content in contents:
    href = content.get('href')
    row = ('国务院', content.string, href)  # (issuing department = State Council, title, link)
    rows.append(row)

header = ['发文部门', '标题', '链接']  # columns: issuing department, title, link
with open('policies.csv', 'w', encoding='gb18030', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(header)
    f_csv.writerows(rows)
print('\n\n最新的信息获取完成\n\n')  # "latest information retrieved"
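# A minimal sketch of how the hrefs collected above could be normalised to absolute URLs,
# assuming the listing is hosted under http://www.gov.cn (the base used in task3).
# urljoin leaves links that are already absolute untouched; BASE_URL is an assumption here.
from urllib.parse import urljoin

BASE_URL = 'http://www.gov.cn'
absolute_rows = [(dept, title, urljoin(BASE_URL, link)) for dept, title, link in rows]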
task2:
import requests
import csv
from bs4 import BeautifulSoup
import re
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edg/15.15063'
headers = {'User-Agent': user_agent}
policies = requests.get("# http", headers=headers)  # placeholder URL
policies.encoding = policies.apparent_encoding
p = BeautifulSoup(policies.text,'html.parser')
contents = p.find_all(href=re.compile('content'))
rows = []
for content in contents:
    href = content.get('href')
    row = (content.get_text(), href)  # (title, link)
    rows.append(row)
for row in rows[:5]:  # preview the first five results
    print(row)
# header = ['发文部门', '标题', '链接']
# with open('policies.csv', 'w', encoding='gb18030', newline='') as f:
#     f_csv = csv.writer(f)
#     f_csv.writerow(header)
#     f_csv.writerows(rows)
print('\n\n最新的信息获取完成\n\n')  # "latest information retrieved"
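# A minimal, hedged sketch of a more defensive fetch for the request above: a timeout and a
# status check avoid hanging or parsing an error page. fetch_listing is a hypothetical helper,
# not part of the original tasks.
def fetch_listing(url, headers, timeout=10):
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()                     # fail fast on HTTP 4xx/5xx
    resp.encoding = resp.apparent_encoding      # guess the page encoding, as above
    return BeautifulSoup(resp.text, 'html.parser')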
task3:
# task2
import requests
import csv
from bs4 import BeautifulSoup
import re
user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edg/15.15063'
headers = {'User-Agent': user_agent}
policies = requests.get("# http", headers=headers)  # placeholder URL
policies.encoding = policies.apparent_encoding
p = BeautifulSoup(policies.text, 'html.parser')
contents = p.find_all('h4')  # each policy entry on the listing page is an <h4> holding an <a> (title/link) and a <span> (date)
xuehao = []
rows = []
# for content in contents:
#     row = ('国务院', content.a.get_text(), content.span.get_text())
#     rows.append(row)
for i in range(len(contents)):
    href = contents[i].find('a').attrs['href']
    xuehao.append(href)
# keep only the links whose position in the list contains the digit 6 (student ID ends in 6)
xuehao_list = [xuehao[i] for i in range(len(xuehao)) if '6' in str(i)]
def get_tocontent(url):
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edg/15.15063'
    headers = {'User-Agent': user_agent}
    policies = requests.get(url, headers=headers)
    policies.encoding = policies.apparent_encoding
    p = BeautifulSoup(policies.text, 'html.parser')
    news_con = p.find('table')  # find the table that holds the policy metadata on the detail page
    if news_con is None:
        # no metadata table: fall back to title, date and source from the article header
        title = p.find('h1').get_text()
        time = p.find('div', class_='pages-date').get_text().split(' ')[0]
        laiyuan = p.find('span', class_='font').get_text()
        row = ('None', 'None', laiyuan, time, title, 'None', time)
    else:
        # pick the metadata cells by their position relative to each table row
        suoyin = news_con.findAll('tr')[0].findAllNext('td')[2].get_text()    # index number
        zhuti = news_con.findAll('tr')[1].findAllNext('td')[3].get_text()     # subject category
        fawen = news_con.findAll('tr')[2].findAllNext('td')[1].get_text()     # issuing authority
        chengwen = news_con.findAll('tr')[2].findAllNext('td')[3].get_text()  # date written
        biaoti = news_con.findAll('tr')[2].findAllNext('td')[5].get_text()    # title
        zihao = news_con.findAll('tr')[4].findAllNext('td')[1].get_text()     # document number
        fabu = news_con.findAll('tr')[4].findAllNext('td')[3].get_text()      # publication date
        row = (suoyin, zhuti, fawen, chengwen, biaoti, zihao, fabu)
    rows.append(row)
# get_tocontent('')
for i in range(len(xuehao_list)):
    if xuehao_list[i].startswith('http'):
        get_tocontent(xuehao_list[i])
    else:
        url = 'http://www.gov.cn' + xuehao_list[i]
        get_tocontent(url)
for j in range(len(rows)):
    print(rows[j])
headers = ['学号尾号为6:索引号', '主题分类', '发文机关', '成文日期', '标题', '发文字号', '发布日期']  # columns: index number (student ID ends in 6), subject category, issuing authority, date written, title, document number, publication date
with open('task2.csv', 'w', encoding='gb18030', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)
    f_csv.writerows(rows)
print('\n\n最新的信息获取完成\n\n')  # "latest information retrieved"
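# The positional indexing above (findAll('tr')[n].findAllNext('td')[m]) is tied to one exact
# table layout. A minimal, hedged alternative sketch: read the label/value <td> pairs and look
# the fields up by their labels instead. It assumes the metadata table alternates label and
# value cells; extract_meta is a hypothetical helper, not part of the original tasks.
def extract_meta(table):
    cells = [td.get_text(strip=True) for td in table.find_all('td')]
    pairs = dict(zip(cells[::2], cells[1::2]))  # label -> value, assuming alternating cells
    fields = ['索引号', '主题分类', '发文机关', '成文日期', '标题', '发文字号', '发布日期']
    return tuple(pairs.get(label, 'None') for label in fields)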