STITP项目记录Period3

这段时间主要写了一个针对 CNN News China 的爬虫,可以爬取该网站上有关中国的报道的标题、时间、URL 以及正文。

import requests
from requests.exceptions import RequestException
import re
import time
from bs4 import BeautifulSoup
import lxml
import json


def get_one_page(url):
    """Download *url* with a desktop-browser User-Agent and return its text.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body as a string when the server answers with HTTP 200.
        ``None`` when the status code is anything else, or when ``requests``
        raises a ``RequestException``; a warning is printed in both cases.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
    }
    try:
        # Without a timeout a stalled server would block the crawl forever.
        # A timeout error is a RequestException, so it is reported below.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException as exc:
        print('url:' + url + '\nWarning : we get an exception!')
        # Print the caught instance. The exception *class* has no ``message``
        # attribute, so reading it here would raise AttributeError and hide
        # the real network error.
        print(str(exc))
        return None
    if response.status_code == 200:
        return response.text
    print('url:' + url + '\nWarning : status_code is not 200!')
    return None


def write_to_file(content):
    """Append *content* to ``result.txt`` as one JSON document per line.

    Non-ASCII characters are written as-is (UTF-8) rather than escaped.
    """
    with open('result.txt', mode='a', encoding='utf-8') as out:
        line = json.dumps(content, ensure_ascii=False)
        out.write(line + '\n')


def get_content(url, title):
    """Download one article page and extract its update time and body text.

    Args:
        url: Article URL, either absolute or relative to
            ``http://edition.cnn.com``.
        title: Headline of the article. Not used by this function; the
            parameter is kept so existing callers keep working.

    Returns:
        A ``(date, content)`` tuple of strings. ``date`` is the text of the
        ``<p class="update-time">`` element, or ``''`` when it is absent.
        ``content`` is the concatenated text of every element that carries
        the ``zn-body__paragraph`` class. Both are ``''`` when the page
        could not be downloaded.
    """
    from urllib.parse import urljoin

    # urljoin leaves absolute URLs untouched and resolves relative ones
    # against the site root, so there is no need to inspect the first
    # character (which fails on '' and misreads paths starting with 'h').
    url = urljoin('http://edition.cnn.com', str(url))
    html = get_one_page(url)
    if html is None:
        # get_one_page has already printed a warning for this URL.
        return '', ''
    soup = BeautifulSoup(html, 'html.parser')
    date_tag = soup.find(name='p', attrs={'class': 'update-time'})
    date = date_tag.get_text() if date_tag is not None else ''
    # Search for the single class name. A multi-class string such as
    # 'zn-body__paragraph speakable' only matches elements whose class
    # attribute is exactly that string, which drops every body paragraph
    # that is not also marked "speakable".
    containers = soup.find_all(attrs={'class': 'zn-body__paragraph'})
    content = ''.join(container.get_text() for container in containers)
    print(url + ' done!')
    return date, content

def parse_section_1(section_1):
    """Save the articles found in the first section of the listing page.

    The first column (``zn__column--idx-0``) is read as a single article and
    saved with type ``'opinion'``. Every ``<article>`` in the second column
    (``zn__column--idx-1``) is saved with type ``'top stories'``. Each
    article is downloaded with ``get_content`` and appended to the result
    file with ``write_to_file``.

    Args:
        section_1: BeautifulSoup tag of the section to parse.
    """
    div = section_1.find(attrs={'class': 'zn__containers'})
    area_1 = div.find(attrs={'class': 'column zn__column--idx-0'})
    area_2 = div.find(attrs={'class': 'column zn__column--idx-1'})

    # get_text() is used instead of .string because .string is None as soon
    # as the headline element contains more than one child node.
    title = area_1.find(attrs={'class': 'cd__headline-text'}).get_text()
    url = area_1.find(name='a').attrs['href']
    date, content = get_content(url, title)
    opinion = {'type': 'opinion', 'title': title, 'url': url, 'date': date, 'content': content}
    write_to_file(opinion)

    for article in area_2.find_all(name='article'):
        url = article.get('data-vr-contentbox')
        headline = article.find(attrs={'class': 'cd__headline-text'})
        if not url or headline is None:
            # Skip an incomplete <article> rather than abort the whole loop.
            print('Warning : skipped an article without url or headline!')
            continue
        title = headline.get_text()
        date, content = get_content(url, title)
        top_stories = {'type': 'top stories', 'title': title, 'url': url, 'date': date, 'content': content}
        write_to_file(top_stories)


def parse_section_2(section_2):
    """Save the articles found in the second section of the listing page.

    Every ``<article>`` in the first column (``zn__column--idx-0``) is saved
    with type ``'news and buzz'``. The second column (``zn__column--idx-1``)
    is read as a single article and saved with type ``'in depth'``. Each
    article is downloaded with ``get_content`` and appended to the result
    file with ``write_to_file``.

    Args:
        section_2: BeautifulSoup tag of the section to parse.
    """
    div = section_2.find(attrs={'class': 'zn__containers'})
    area_1 = div.find(attrs={'class': 'column zn__column--idx-0'})
    area_2 = div.find(attrs={'class': 'column zn__column--idx-1'})

    for article in area_1.find_all(name='article'):
        url = article.get('data-vr-contentbox')
        headline = article.find(attrs={'class': 'cd__headline-text'})
        if not url or headline is None:
            # Skip an incomplete <article> rather than abort the whole loop.
            print('Warning : skipped an article without url or headline!')
            continue
        # get_text() is used instead of .string because .string is None as
        # soon as the headline element contains more than one child node.
        title = headline.get_text()
        date, content = get_content(url, title)
        news = {'type': 'news and buzz', 'title': title, 'url': url, 'date': date, 'content': content}
        write_to_file(news)

    title = area_2.find(name='span', attrs={'class': 'cd__headline-text'}).get_text()
    url = area_2.find(name='a').attrs['href']
    date, content = get_content(url, title)
    in_depth = {'type': 'in depth', 'title': title, 'url': url, 'date': date, 'content': content}
    write_to_file(in_depth)

def main():
    """Crawl the CNN China listing page and save its first two sections.

    Downloads ``https://edition.cnn.com/china``, then hands the first
    ``<section>`` to ``parse_section_1`` and the second to
    ``parse_section_2``. Returns without parsing anything when the page
    cannot be downloaded or contains fewer than two sections.
    """
    url = 'https://edition.cnn.com/china'
    html = get_one_page(url)
    if html is None:
        # get_one_page has already printed a warning; there is nothing to parse.
        return
    soup = BeautifulSoup(html, 'html.parser')
    sections = soup.find_all(name='section')
    if len(sections) < 2:
        # Indexing sections[0] / sections[1] below would raise IndexError.
        print('url:' + url + '\nWarning : expected at least 2 sections, found ' + str(len(sections)) + '!')
        return
    parse_section_1(sections[0])
    parse_section_2(sections[1])


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
	main()

可惜的是,这份代码仍然存在一个问题:无法爬取单篇报道的全部正文。
只怪它的 HTML 代码太过复杂……
不水了,继续改。

你可能感兴趣的:(自然语言处理)