Web Crawler Homework

task1:

import requests
import csv
from bs4 import BeautifulSoup
import re

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edg/15.15063'
headers = {'User-Agent':user_agent}
policies = requests.get("#国家政策信息网网址")
policies.encoding = policies.apparent_encoding

p = BeautifulSoup(policies.text,'html.parser')
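# collect every <a> tag whose href contains 'content' (the links to the individual policy pages)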
contents = p.find_all(href=re.compile('content'))
rows = []

for content in contents:
    href = content.get('href')
    row = ('国务院', content.string, href)
    rows.append(row)

header = ['发文部门','标题','链接']
with open('policies.csv', 'w', encoding='gb18030', newline='') as f:  # newline='' prevents blank rows from csv.writer on Windows
    f_csv = csv.writer(f)
    f_csv.writerow(header)
    f_csv.writerows(rows)

print('\n\n最新的信息获取完成\n\n')
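
A small fetch helper keeps the request boilerplate in one place. This is a minimal sketch under the same assumptions as above (the placeholder URL and the fetch_page name are illustrative, not part of the assignment):

import requests

def fetch_page(url, user_agent):
    # send the request with the custom User-Agent, fail fast on HTTP errors,
    # and let requests guess the encoding before returning the HTML text
    resp = requests.get(url, headers={'User-Agent': user_agent}, timeout=10)
    resp.raise_for_status()
    resp.encoding = resp.apparent_encoding
    return resp.text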

task2:

import requests
import csv
from bs4 import BeautifulSoup
import re

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edg/15.15063'
headers = {'User-Agent':user_agent}
policies = requests.get("# http")
policies.encoding = policies.apparent_encoding

p = BeautifulSoup(policies.text,'html.parser')
contents = p.find_all(href=re.compile('content'))
rows = []

for content in contents:
    href = content.get('href')
    row = (content.get_text() , href)
    rows.append(row)

for row in rows[:5]:  # show the first five results; slicing avoids an IndexError if fewer links were found
    print(row)

# header = ['发文部门','标题','链接']
# with open('policies.csv','w',encoding='gb18030') as f:
#     f_csv = csv.writer(f)
#     f_csv.writerow(header)
#     f_csv.writerows(rows)

print('\n\n最新的信息获取完成\n\n')
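
To see what the href filter actually matches, here is a self-contained snippet; the sample HTML fragment is made up purely for illustration:

from bs4 import BeautifulSoup
import re

# fabricated fragment that mimics the listing page: one policy link, one navigation link
sample = '<a href="/zhengce/content/2024/doc1.htm">政策一</a><a href="/index.htm">首页</a>'
soup = BeautifulSoup(sample, 'html.parser')
for a in soup.find_all(href=re.compile('content')):
    print(a.get_text(), a.get('href'))  # only the first link matches the 'content' pattern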

task3:

# task3

import requests
import csv
from bs4 import BeautifulSoup
import re

user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edg/15.15063'
headers = {'User-Agent': user_agent}
policies = requests.get("# http")
policies.encoding = policies.apparent_encoding
p = BeautifulSoup(policies.text, 'html.parser')
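# each item in the listing is an <h4> whose child <a> carries the link to the policy detail page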
contents = p.find_all('h4')
xuehao = []
rows = []

# for content in contents:
#     row = ('国务院', content.a.get_text(),content.span.get_text())
#     rows.append(row)
for i in range(len(contents)):
    href = contents[i].find('a').attrs['href']
    xuehao.append(href)
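# student ID ends in 6: keep only the links whose position index contains the digit 6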
xuehao_list = [xuehao[i] for i in range(len(xuehao)) if '6' in str(i)]




def get_tocontent(url):
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edg/15.15063'
    headers = {'User-Agent': user_agent}
    policies = requests.get(url, headers=headers)
    policies.encoding = policies.apparent_encoding
    p = BeautifulSoup(policies.text, 'html.parser')
    news_con = p.find('table')  # find the table that holds the required policy metadata on the detail page
    if news_con is None:
        title = p.find('h1').get_text()
        time = p.find('div',class_='pages-date').get_text().split(' ')[0]
        laiyuan = p.find('span',class_='font').get_text()
        row = ('None','None',laiyuan,time,title,'None',time)
    else:
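        # read the seven metadata fields from fixed cell positions in the details table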
        suoyin = news_con.find_all('tr')[0].find_all_next('td')[2].get_text()
        zhuti = news_con.find_all('tr')[1].find_all_next('td')[3].get_text()
        fawen = news_con.find_all('tr')[2].find_all_next('td')[1].get_text()
        chengwen = news_con.find_all('tr')[2].find_all_next('td')[3].get_text()
        biaoti = news_con.find_all('tr')[2].find_all_next('td')[5].get_text()
        zihao = news_con.find_all('tr')[4].find_all_next('td')[1].get_text()
        fabu = news_con.find_all('tr')[4].find_all_next('td')[3].get_text()
        row = (suoyin,zhuti,fawen,chengwen,biaoti,zihao,fabu)
    global rows
    rows.append(row)
# get_tocontent('')

for link in xuehao_list:
    if link.startswith('http'):
        get_tocontent(link)
    else:
        get_tocontent('http://www.gov.cn' + link)
for row in rows:
    print(row)
csv_header = ['学号尾号为6:索引号','主题分类','发文机关','成文日期','标题','发文字号','发布日期']  # renamed from 'headers' to avoid shadowing the request headers dict
with open('task2.csv', 'w', encoding='gb18030', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(csv_header)
    f_csv.writerows(rows)
print('\n\n最新的信息获取完成\n\n')
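
If the relative-link handling needs to cover more cases, urllib.parse.urljoin from the standard library is an alternative to string concatenation; a short sketch (the example links are made up, the base URL is the one already used above):

from urllib.parse import urljoin

base = 'http://www.gov.cn'
links = ['/zhengce/content/doc.htm', 'http://www.gov.cn/zhengce/content/other.htm']
for link in links:
    # urljoin resolves relative paths against base and leaves absolute URLs untouched
    print(urljoin(base, link))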
