STITP Project Log, Period 4

During this period I polished the crawler a bit; it can now scrape the full text of each article.

import requests
from requests.exceptions import RequestException
import re
import time
from bs4 import BeautifulSoup
import lxml
import json


def get_one_page(url):
	try:
		headers = {
			'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
		}
		response = requests.get(url, headers=headers)
		if response.status_code == 200:
			return response.text
		print('url:' + url + '\nWarning : status_code is not 200!')
		return None
	except RequestException as e:
		print('url:' + url + '\nWarning : we got an exception!')
		print(str(e))
		return None


def write_to_file(content):
	with open('result.txt', 'a', encoding='utf-8') as f:
		f.write(json.dumps(content, ensure_ascii=False) + '\n')


def get_content(url, title):
	# Links taken from the index page may be relative; prefix the site root if so.
	if str(url)[0] != 'h':
		url = 'http://edition.cnn.com' + url
	html = get_one_page(url)
	if html is None:
		return "", "", ""
	soup = BeautifulSoup(html, 'html.parser')
	# print(soup.prettify())
	date = soup.find(name='p', attrs={'class':'update-time'})
	date = date.get_text() if date is not None else ""
	divs = soup.find(name='section', attrs={'id':'body-text'})
	content = ""
	try:
		divs = divs.find(name='div', attrs={'class':'l-container'})
		abstract = divs.find(attrs={'class':'el__leafmedia el__leafmedia--sourced-paragraph'}).get_text()
		# The lead paragraph and the remaining body paragraphs use different class names.
		for div in divs.find_all(name='div', attrs={'class':'zn-body__paragraph speakable'}):
			content = content + div.get_text()
		for div in divs.find_all(name='div', attrs={'class':'zn-body__paragraph'}):
			content = content + div.get_text()
		print(url + ' done!')
	except AttributeError:
		# Articles that do not follow the expected layout get an empty abstract.
		abstract = ""
	return date, abstract, content

def parse_section_1(section_1):
	# Column 0 holds the lead story; column 1 holds the "Top stories" list.
	div = section_1.find(attrs={'class':'zn__containers'})
	area_1 = div.find(attrs={'class':'column zn__column--idx-0'})
	area_2 = div.find(attrs={'class':'column zn__column--idx-1'})
	title = area_1.find(attrs={'class':'cd__headline-text'}).string
	url = area_1.find(name='a').attrs['href']
	date, abstract, content = get_content(url, title)
	opinion = {'type':'important', 'title':title, 'url':url, 'date':date, 'abstract': abstract, 'content':content}
	write_to_file(opinion)
	for article in area_2.find_all(name='article'):
		url = article['data-vr-contentbox']
		title = article.find(attrs={'class':'cd__headline-text'}).string
		date, abstract, content = get_content(url, title)
		top_stories = {'type':'top stories', 'title':title, 'url':url, 'date':date, 'abstract':abstract, 'content':content}
		write_to_file(top_stories)
	# print('parse_section_1 done!')


def parse_section_2(section_2):
	# Column 0 holds the "News and buzz" articles; column 1 holds a single "In depth" story.
	div = section_2.find(attrs={'class':'zn__containers'})
	area_1 = div.find(attrs={'class':'column zn__column--idx-0'})
	area_2 = div.find(attrs={'class':'column zn__column--idx-1'})
	for article in area_1.find_all(name='article'):
		url = article['data-vr-contentbox']
		title = article.find(attrs={'class':'cd__headline-text'}).string
		date, abstract, content = get_content(url, title)
		news = {'type':'news and buzz', 'title':title, 'url':url, 'date':date, 'abstract':abstract, 'content':content}
		write_to_file(news)
	title = area_2.find(name='span', attrs={'class':'cd__headline-text'}).string
	url = area_2.find(name='a').attrs['href']
	date, abstract, content = get_content(url, title)
	in_depth = {'type':'in depth', 'title':title, 'url':url, 'date':date, 'abstract':abstract, 'content':content}
	write_to_file(in_depth)
	# print('parse_section_2 done!')

def main():
	url = 'https://edition.cnn.com/china'
	html = get_one_page(url)
	if html is None:
		return
	soup = BeautifulSoup(html, 'html.parser')
	# print(soup.prettify())
	sections = soup.find_all(name='section')
	parse_section_1(sections[0])
	parse_section_2(sections[1])


if __name__ == '__main__':
	main()
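
Since write_to_file appends one JSON object per line, result.txt ends up in JSON Lines form and can be loaded back for later processing. Below is a minimal sketch of that; the load_results helper and the per-type count are only illustrative, not part of the crawler itself.

import json
from collections import Counter


def load_results(path='result.txt'):
	# Each non-empty line is one record written by write_to_file.
	records = []
	with open(path, 'r', encoding='utf-8') as f:
		for line in f:
			line = line.strip()
			if line:
				records.append(json.loads(line))
	return records


if __name__ == '__main__':
	records = load_results()
	# Count how many articles were saved per section type ('important', 'top stories', ...).
	print(Counter(record['type'] for record in records))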

To be honest, I have been slacking off a bit lately; the project has made no progress at all, and that feels pretty bad.
